hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 4e47d429b45c0431b75c34804eaded1029a35e74
parent daff63ea85e347be3fde00292d4ff7f5a433a737
Author: Sebastian <sebastian@sebsite.pw>
Date:   Mon, 27 Nov 2023 01:18:05 -0500

encoding::utf8: operate exclusively on byte slices

Previously, utf8::decode and utf8::valid accepted (str | []u8). This has
been changed, so they now only accept []u8.

utf8::decode with a string operand duplicates the functionality of
strings::iter.

utf8::valid only makes sense on byte slices, since str can't hold
invalid UTF-8. If code really wants to check if the contents of a string
are valid, it's simple enough to just call strings::toutf8 on the str
first.

Signed-off-by: Sebastian <sebastian@sebsite.pw>

Diffstat:
M encoding/utf8/decode.ha | 20 ++++----------------
M hare/parse/+test/loc.ha | 15 ++++-----------
M strings/iter.ha | 4 ++--
M strings/utf8.ha | 2 +-
M types/c/strings.ha | 6 ++++--
5 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/encoding/utf8/decode.ha b/encoding/utf8/decode.ha @@ -1,23 +1,14 @@ // SPDX-License-Identifier: MPL-2.0 // (c) Hare authors <https://harelang.org> -fn toutf8(in: str) []u8 = *(&in: *[]u8); - -fn fromtagged(in: (str | []u8)) []u8 = match (in) { -case let s: str => - return toutf8(s); -case let b: []u8 => - return b; -}; - export type decoder = struct { offs: size, src: []u8, }; // Initializes a new UTF-8 decoder. -export fn decode(src: (str | []u8)) decoder = decoder { - src = fromtagged(src), +export fn decode(src: []u8) decoder = decoder { + src = src, offs = 0, }; @@ -161,11 +152,8 @@ export fn prev(d: *decoder) (rune | void | more | invalid) = { assert(prev(&decoder) is invalid); }; -// Returns true if a given string or byte slice contains only valid UTF-8 -// sequences. Note that Hare strings (str) are always valid UTF-8 - if this -// returns false for a str type, something funny is going on. -export fn valid(src: (str | []u8)) bool = { - let src = fromtagged(src); +// Returns true if a given byte slice contains only valid UTF-8 sequences. +export fn valid(src: []u8) bool = { let state = 0; for (let i = 0z; i < len(src) && state >= 0; i += 1) { state = table[state][src[i]]; diff --git a/hare/parse/+test/loc.ha b/hare/parse/+test/loc.ha @@ -2,7 +2,6 @@ // (c) Hare authors <https://harelang.org> use bufio; -use encoding::utf8; use fmt; use hare::ast; use hare::lex; @@ -26,11 +25,8 @@ fn expr_testloc(srcs: str...) void = for (let i = 0z; i < len(srcs); i += 1) { }; defer ast::expr_finish(&exp); let runes = 0z; - let d = utf8::decode(srcs[i]); - for (true) match (utf8::next(&d)!) { - case void => - break; - case rune => + let it = strings::iter(srcs[i]); + for (strings::next(&it) is rune) { runes += 1; }; assert(exp.start.line == 1 && exp.start.col == 1); @@ -117,11 +113,8 @@ fn type_testloc(srcs: str...) 
void = for (let i = 0z; i < len(srcs); i += 1) { }; defer ast::type_finish(&typ); let runes = 0z; - let d = utf8::decode(srcs[i]); - for (true) match (utf8::next(&d)!) { - case void => - break; - case rune => + let it = strings::iter(srcs[i]); + for (strings::next(&it) is rune) { runes += 1; }; assert(typ.start.line == 1 && typ.start.col == 1); diff --git a/strings/iter.ha b/strings/iter.ha @@ -22,7 +22,7 @@ export type iterator = struct { // strings::next(&dup); // '!' // strings::next(&dup); // void export fn iter(src: str) iterator = iterator { - dec = utf8::decode(src), + dec = utf8::decode(toutf8(src)), reverse = false, }; @@ -30,7 +30,7 @@ export fn iter(src: str) iterator = iterator { // backwards with each call to [[next]]. export fn riter(src: str) iterator = { let ret = iterator { - dec = utf8::decode(src), + dec = utf8::decode(toutf8(src)), reverse = true, }; ret.dec.offs = len(src); diff --git a/strings/utf8.ha b/strings/utf8.ha @@ -21,7 +21,7 @@ export fn fromutf8_unsafe(in: []u8) str = { // [[encoding::utf8::invalid]] is returned instead. export fn fromutf8(in: []u8) (str | utf8::invalid) = { let s = fromutf8_unsafe(in); - if (!utf8::valid(s)) { + if (!utf8::valid(in)) { return utf8::invalid; }; return s; diff --git a/types/c/strings.ha b/types/c/strings.ha @@ -44,8 +44,10 @@ export fn tostr(cstr: *const char) (const str | utf8::invalid) = { // Converts a C string with a given length to a Hare string. If the string is // not valid UTF-8, return [[encoding::utf8::invalid]]. export fn tostrn(cstr: *const char, length: size) (const str | utf8::invalid) = { - let s = tostrn_unsafe(cstr, length); - return if (utf8::valid(s)) s else utf8::invalid; + if (!utf8::valid((cstr: *[*]u8)[..length])) { + return utf8::invalid; + }; + return tostrn_unsafe(cstr, length); }; // Converts a Hare string to a C string. The result is allocated; the caller