commit 513dd8dbfbb955f330211c29616a9d59b3729c9f
parent a04c25428ea6c7c072ce6232e1077999b7262886
Author: Byron Torres <b@torresjrjr.com>
Date: Sun, 29 Oct 2023 10:57:54 +0000
encoding::utf8: add error type invalidcodepoint
Signed-off-by: Byron Torres <b@torresjrjr.com>
Diffstat:
17 files changed, 71 insertions(+), 41 deletions(-)
diff --git a/ascii/string.ha b/ascii/string.ha
@@ -22,7 +22,7 @@ export fn strlower_buf(s: str, buf: []u8) str = {
let buf = buf[..0];
let it = strings::iter(s);
for (let r => strings::next(&it)) {
- static append(buf, utf8::encoderune(tolower(r))...)!;
+ static append(buf, utf8::encoderune(tolower(r))!...)!;
};
return strings::fromutf8(buf)!;
};
@@ -44,7 +44,7 @@ export fn strupper_buf(s: str, buf: []u8) str = {
let buf = buf[..0];
let it = strings::iter(s);
for (let r => strings::next(&it)) {
- static append(buf, utf8::encoderune(toupper(r))...)!;
+ static append(buf, utf8::encoderune(toupper(r))!...)!;
};
return strings::fromutf8(buf)!;
};
diff --git a/bufio/scanner_test+test.ha b/bufio/scanner_test+test.ha
@@ -153,7 +153,7 @@ use types;
unread(&scanner, [b]);
let b = scan_rune(&scanner) as rune;
- unread(&scanner, utf8::encoderune(b));
+ unread(&scanner, utf8::encoderune(b)!);
let l = scan_line(&scanner)! as const str;
assert(l == " I will not repeat ");
diff --git a/bufio/stream.ha b/bufio/stream.ha
@@ -148,7 +148,7 @@ fn stream_unread(s: *stream, buf: []u8) void = {
// Unreads a rune; see [[unread]].
export fn unreadrune(s: io::handle, rn: rune) void = {
- const buf = utf8::encoderune(rn);
+ const buf = utf8::encoderune(rn)!;
unread(s, buf);
};
diff --git a/cmd/haredoc/doc/html.ha b/cmd/haredoc/doc/html.ha
@@ -33,7 +33,7 @@ fn html_escape(out: io::handle, in: str) (size | io::error) = {
case '\'' =>
yield "'";
case =>
- yield strings::fromutf8(utf8::encoderune(rn))!;
+ yield strings::fromutf8(utf8::encoderune(rn)!)!;
})?;
};
return z;
diff --git a/debug/ident.ha b/debug/ident.ha
@@ -16,7 +16,7 @@ export fn symname_to_ident(name: str) const str = {
static append(slice, ':')!;
static append(slice, ':')!;
} else {
- static append(slice, utf8::encoderune(rn)...)!;
+ static append(slice, utf8::encoderune(rn)!...)!;
};
};
diff --git a/encoding/asn1/strings.ha b/encoding/asn1/strings.ha
@@ -201,7 +201,7 @@ fn bmp_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
};
let r = endian::begetu16(rbuf): rune;
- let rb = utf8::encoderune(r);
+ let rb = utf8::encoderune(r)!;
if (len(buf) - n < len(rb)) {
dataunread(s.d, rbuf);
return n;
@@ -227,7 +227,7 @@ fn universal_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
};
let r = endian::begetu32(rbuf): rune;
- let rb = utf8::encoderune(r);
+ let rb = utf8::encoderune(r)!;
if (len(buf) - n < len(rb)) {
dataunread(s.d, rbuf);
return n;
@@ -258,7 +258,7 @@ fn t61_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
match (t61_chardecode(in)) {
case let r: rune =>
- let raw = utf8::encoderune(r);
+ let raw = utf8::encoderune(r)!;
const bufremain = len(buf) - n;
if (len(raw) < bufremain) {
buf[n..n + len(raw)] = raw[..];
diff --git a/encoding/utf8/encode.ha b/encoding/utf8/encode.ha
@@ -1,13 +1,21 @@
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>
+// This rune is not a valid Unicode codepoint: a surrogate, or above 0x10FFFF.
+export type invalidcodepoint = !rune;
+
+fn isvalidcodepoint(cp: u32) bool = {
+ return (cp < 0xD800 || cp > 0xDFFF) && cp <= 0x10FFFF;
+};
+
// Encodes a rune as UTF-8 and returns the result as a slice. The return value
// is statically allocated, and will not be consistent after subsequent calls to
// encoderune.
-export fn encoderune(r: rune) []u8 = {
+export fn encoderune(r: rune) ([]u8 | invalidcodepoint) = {
let ch = r: u32, n = 0z, first = 0u8;
- assert((ch < 0xD800 || ch > 0xDFFF) && ch <= 0x10FFFF,
- "the rune is not a valid Unicode codepoint");
+ if (!isvalidcodepoint(ch)) {
+ return r: invalidcodepoint;
+ };
if (ch < 0x80) {
first = 0;
@@ -33,16 +41,41 @@ export fn encoderune(r: rune) []u8 = {
};
+// Checks encoderune against known encodings and against rejected codepoints.
@test fn encode() void = {
- const expected: [_][]u8 = [
- [0],
- [0x25],
- [0xE3, 0x81, 0x93],
+ const testcases: [](rune, bool, []u8) = [
+ // input rune
+ // | expects error
+ // | | expected encoding
+ ('\0',
+ false, [0]),
+ ('%',
+ false, [0x25]),
+ ('こ',
+ false, [0xE3, 0x81, 0x93]),
+ (0xD800: rune,
+ true, []),
+ (0xDF00: rune,
+ true, []),
+ (0x110000: rune,
+ true, []),
];
- const inputs = ['\0', '%', 'こ'];
- for (let i = 0z; i < len(inputs); i += 1) {
- const out = encoderune(inputs[i]);
- for (let j = 0z; j < len(expected[i]); j += 1) {
- assert(out[j] == expected[i][j]);
+
+ for (let i = 0z; i < len(testcases); i += 1) {
+ const tc = testcases[i];
+ const input = tc.0;
+ const want = tc.2;
+
+ match(encoderune(input)) {
+ case invalidcodepoint =>
+ assert(tc.1, "want []u8, got invalidcodepoint");
+ case let got: []u8 =>
+ assert(!tc.1, "want invalidcodepoint, got []u8");
+ // The loop below only walks len(want) bytes; check the length too.
+ assert(len(got) == len(want), "[]u8 length mismatch");
+ for (let j = 0z; j < len(want); j += 1) {
+ assert(got[j] == want[j],
+ "[]u8 mismatch");
+ };
};
};
};
diff --git a/fmt/print.ha b/fmt/print.ha
@@ -63,11 +63,11 @@ fn format(
let z = 0z;
for (z < start) {
- z += io::write(out, utf8::encoderune(mod.pad))?;
+ z += io::write(out, utf8::encoderune(mod.pad)!)?;
};
z += format_raw(out, arg, mod)?;
for (z < mod.width) {
- z += io::write(out, utf8::encoderune(mod.pad))?;
+ z += io::write(out, utf8::encoderune(mod.pad)!)?;
};
return z;
@@ -81,7 +81,7 @@ fn format_raw(
case void =>
return io::write(out, strings::toutf8("void"));
case let r: rune =>
- return io::write(out, utf8::encoderune(r));
+ return io::write(out, utf8::encoderune(r)!);
case let s: str =>
if (mod.prec > 0 && mod.prec < len(s)) {
s = strings::sub(s, 0, mod.prec);
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -407,7 +407,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
let started = false;
let base = strconv::base::DEC;
if (r.0 == '0') {
- append(chars, utf8::encoderune(r.0)...)!;
+ append(chars, utf8::encoderune(r.0)!...)!;
r = match (next(lex)?) {
case io::EOF =>
return (ltok::LIT_ICONST, 0u64, loc);
@@ -487,7 +487,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
};
unget(lex, r.0);
float = true;
- append(chars, utf8::encoderune('.')...)!;
+ append(chars, utf8::encoderune('.')!...)!;
};
case 'e', 'E', 'p', 'P' =>
if (!started) {
@@ -504,7 +504,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
break;
} else {
if (end == 0) end = len(chars);
- append(chars, utf8::encoderune(r.0)...)!;
+ append(chars, utf8::encoderune(r.0)!...)!;
exp = len(chars);
r = match (next(lex)?) {
case io::EOF =>
@@ -514,7 +514,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
};
switch (r.0) {
case '+', '-' =>
- append(chars, utf8::encoderune(r.0)...)!;
+ append(chars, utf8::encoderune(r.0)!...)!;
case =>
unget(lex, r.0);
};
@@ -533,7 +533,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
} else {
suff = len(chars);
if (end == 0) end = len(chars);
- append(chars, utf8::encoderune(r.0)...)!;
+ append(chars, utf8::encoderune(r.0)!...)!;
basechrs = "0123456789";
};
case '_' =>
@@ -556,7 +556,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
};
} else {
last_rune_was_separator = false;
- append(chars, utf8::encoderune(r.0)...)!;
+ append(chars, utf8::encoderune(r.0)!...)!;
};
started = true;
};
diff --git a/memio/ops.ha b/memio/ops.ha
@@ -131,4 +131,4 @@ export fn rjoin(out: io::handle, delim: str, strs: str...) (size | io::error) =
// Appends a rune to a stream.
export fn appendrune(out: io::handle, r: rune) (size | io::error) =
- io::writeall(out, utf8::encoderune(r));
+ io::writeall(out, utf8::encoderune(r)!);
diff --git a/net/uri/fmt.ha b/net/uri/fmt.ha
@@ -99,7 +99,7 @@ fn percent_encode(out: io::handle, src: str, allowed: str) (size | io::error) =
if (ascii::isalnum(r) || strings::contains(allowed, r)) {
n += fmt::fprint(out, r)?;
} else {
- const en = utf8::encoderune(r);
+ const en = utf8::encoderune(r)!;
for (let elem .. en) {
n += fmt::fprintf(out, "%{:X}", elem)?;
};
diff --git a/shlex/escape.ha b/shlex/escape.ha
@@ -38,7 +38,7 @@ export fn quote(sink: io::handle, s: str) (size | io::error) = {
if (rn == '\'') {
z += io::writeall(sink, strings::toutf8(`'"'"'`))?;
} else {
- z += io::writeall(sink, utf8::encoderune(rn))?;
+ z += io::writeall(sink, utf8::encoderune(rn)!)?;
};
};
diff --git a/strings/contains.ha b/strings/contains.ha
@@ -14,7 +14,7 @@ export fn contains(haystack: str, needles: (str | rune)...) bool = {
toutf8(s));
case let r: rune =>
yield bytes::contains(toutf8(haystack),
- utf8::encoderune(r));
+ utf8::encoderune(r)!);
};
if (matched) {
return true;
diff --git a/strings/index.ha b/strings/index.ha
@@ -129,7 +129,7 @@ export fn byteindex(haystack: str, needle: (str | rune)) (size | void) = {
case let s: str =>
yield toutf8(s);
case let r: rune =>
- yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r);
+ yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r)!;
});
};
@@ -140,7 +140,7 @@ export fn rbyteindex(haystack: str, needle: (str | rune)) (size | void) = {
case let s: str =>
yield toutf8(s);
case let r: rune =>
- yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r);
+ yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r)!;
});
};
diff --git a/strings/pad.ha b/strings/pad.ha
@@ -11,7 +11,7 @@ export fn lpad(s: str, p: rune, maxlen: size) str = {
};
let res: []u8 = alloc([], maxlen)!;
for (let i = 0z; i < maxlen - len(s); i += 1) {
- append(res, utf8::encoderune(p)...)!;
+ append(res, utf8::encoderune(p)!...)!;
};
append(res, toutf8(s)...)!;
return fromutf8_unsafe(res[..maxlen]);
@@ -40,7 +40,7 @@ export fn rpad(s: str, p: rune, maxlen: size) str = {
let res: []u8 = alloc([], maxlen)!;
append(res, toutf8(s)...)!;
for (let i = 0z; i < maxlen - len(s); i += 1) {
- append(res, utf8::encoderune(p)...)!;
+ append(res, utf8::encoderune(p)!...)!;
};
return fromutf8_unsafe(res[..maxlen]);
};
diff --git a/strings/runes.ha b/strings/runes.ha
@@ -18,7 +18,7 @@ export fn torunes(s: str) []rune = {
export fn fromrunes(runes: []rune) str = {
let bytes: []u8 = [];
for (let r .. runes) {
- const bs = utf8::encoderune(r);
+ const bs = utf8::encoderune(r)!;
append(bytes, bs...)!;
};
return fromutf8_unsafe(bytes);
diff --git a/strings/suffix.ha b/strings/suffix.ha
@@ -8,7 +8,7 @@ use encoding::utf8;
export fn hasprefix(in: str, prefix: (str | rune)) bool = {
let prefix = match (prefix) {
case let r: rune =>
- yield utf8::encoderune(r);
+ yield utf8::encoderune(r)!;
case let s: str =>
yield toutf8(s);
};
@@ -26,7 +26,7 @@ export fn hasprefix(in: str, prefix: (str | rune)) bool = {
export fn hassuffix(in: str, suff: (str | rune)) bool = {
let suff = match (suff) {
case let r: rune =>
- yield utf8::encoderune(r);
+ yield utf8::encoderune(r)!;
case let s: str =>
yield toutf8(s);
};