encoding::utf8: add error type invalidcodepoint - hare - [hare] The Hare programming language

commit 513dd8dbfbb955f330211c29616a9d59b3729c9f
parent a04c25428ea6c7c072ce6232e1077999b7262886
Author: Byron Torres <b@torresjrjr.com>
Date:   Sun, 29 Oct 2023 10:57:54 +0000

encoding::utf8: add error type invalidcodepoint

Signed-off-by: Byron Torres <b@torresjrjr.com>

Diffstat:
M ascii/string.ha  | 4 ++--
M bufio/scanner_test+test.ha  | 2 +-
M bufio/stream.ha  | 2 +-
M cmd/haredoc/doc/html.ha  | 2 +-
M debug/ident.ha  | 2 +-
M encoding/asn1/strings.ha  | 6 +++---
M encoding/utf8/encode.ha  | 54 ++++++++++++++++++++++++++++++++++++++++++------------
M fmt/print.ha  | 6 +++---
M hare/lex/lex.ha  | 12 ++++++------
M memio/ops.ha  | 2 +-
M net/uri/fmt.ha  | 2 +-
M shlex/escape.ha  | 2 +-
M strings/contains.ha  | 2 +-
M strings/index.ha  | 4 ++--
M strings/pad.ha  | 4 ++--
M strings/runes.ha  | 2 +-
M strings/suffix.ha  | 4 ++--

17 files changed, 71 insertions(+), 41 deletions(-)
diff --git a/ascii/string.ha b/ascii/string.ha
@@ -22,7 +22,7 @@ export fn strlower_buf(s: str, buf: []u8) str = {
 	let buf = buf[..0];
 	let it = strings::iter(s);
 	for (let r => strings::next(&it)) {
-		static append(buf, utf8::encoderune(tolower(r))...)!;
+		static append(buf, utf8::encoderune(tolower(r))!...)!;
 	};
 	return strings::fromutf8(buf)!;
 };
@@ -44,7 +44,7 @@ export fn strupper_buf(s: str, buf: []u8) str = {
 	let buf = buf[..0];
 	let it = strings::iter(s);
 	for (let r => strings::next(&it)) {
-		static append(buf, utf8::encoderune(toupper(r))...)!;
+		static append(buf, utf8::encoderune(toupper(r))!...)!;
 	};
 	return strings::fromutf8(buf)!;
 };
diff --git a/bufio/scanner_test+test.ha b/bufio/scanner_test+test.ha
@@ -153,7 +153,7 @@ use types;
 	unread(&scanner, [b]);
 
 	let b = scan_rune(&scanner) as rune;
-	unread(&scanner, utf8::encoderune(b));
+	unread(&scanner, utf8::encoderune(b)!);
 
 	let l = scan_line(&scanner)! as const str;
 	assert(l == " I will not repeat  ");
diff --git a/bufio/stream.ha b/bufio/stream.ha
@@ -148,7 +148,7 @@ fn stream_unread(s: *stream, buf: []u8) void = {
 
 // Unreads a rune; see [[unread]].
 export fn unreadrune(s: io::handle, rn: rune) void = {
-	const buf = utf8::encoderune(rn);
+	const buf = utf8::encoderune(rn)!;
 	unread(s, buf);
 };
 
diff --git a/cmd/haredoc/doc/html.ha b/cmd/haredoc/doc/html.ha
@@ -33,7 +33,7 @@ fn html_escape(out: io::handle, in: str) (size | io::error) = {
 		case '\'' =>
 			yield "&apos;";
 		case =>
-			yield strings::fromutf8(utf8::encoderune(rn))!;
+			yield strings::fromutf8(utf8::encoderune(rn)!)!;
 		})?;
 	};
 	return z;
diff --git a/debug/ident.ha b/debug/ident.ha
@@ -16,7 +16,7 @@ export fn symname_to_ident(name: str) const str = {
 			static append(slice, ':')!;
 			static append(slice, ':')!;
 		} else {
-			static append(slice, utf8::encoderune(rn)...)!;
+			static append(slice, utf8::encoderune(rn)!...)!;
 		};
 	};
 
diff --git a/encoding/asn1/strings.ha b/encoding/asn1/strings.ha
@@ -201,7 +201,7 @@ fn bmp_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
 		};
 
 		let r = endian::begetu16(rbuf): rune;
-		let rb = utf8::encoderune(r);
+		let rb = utf8::encoderune(r)!;
 		if (len(buf) - n < len(rb)) {
 			dataunread(s.d, rbuf);
 			return n;
@@ -227,7 +227,7 @@ fn universal_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
 		};
 
 		let r = endian::begetu32(rbuf): rune;
-		let rb = utf8::encoderune(r);
+		let rb = utf8::encoderune(r)!;
 		if (len(buf) - n < len(rb)) {
 			dataunread(s.d, rbuf);
 			return n;
@@ -258,7 +258,7 @@ fn t61_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
 
 		match (t61_chardecode(in)) {
 		case let r: rune =>
-			let raw = utf8::encoderune(r);
+			let raw = utf8::encoderune(r)!;
 			const bufremain = len(buf) - n;
 			if (len(raw) < bufremain) {
 				buf[n..n + len(raw)] = raw[..];
diff --git a/encoding/utf8/encode.ha b/encoding/utf8/encode.ha
@@ -1,13 +1,21 @@
 // SPDX-License-Identifier: MPL-2.0
 // (c) Hare authors <https://harelang.org>
 
+// The value of this rune is not a valid Unicode codepoint.
+export type invalidcodepoint = !rune;
+
+fn isvalidcodepoint(cp: u32) bool = {
+	return (cp < 0xD800 || cp > 0xDFFF) && cp <= 0x10FFFF;
+};
+
 // Encodes a rune as UTF-8 and returns the result as a slice. The return value
 // is statically allocated, and will not be consistent after subsequent calls to
 // encoderune.
-export fn encoderune(r: rune) []u8 = {
+export fn encoderune(r: rune) ([]u8 | invalidcodepoint) = {
 	let ch = r: u32, n = 0z, first = 0u8;
-	assert((ch < 0xD800 || ch > 0xDFFF) && ch <= 0x10FFFF,
-		"the rune is not a valid Unicode codepoint");
+	if (!isvalidcodepoint(ch)) {
+		return r: invalidcodepoint;
+	};
 
 	if (ch < 0x80) {
 		first = 0;
@@ -33,16 +41,38 @@ export fn encoderune(r: rune) []u8 = {
 };
 
 @test fn encode() void = {
-	const expected: [_][]u8 = [
-		[0],
-		[0x25],
-		[0xE3, 0x81, 0x93],
+	const testcases: [](rune, bool, []u8) = [
+	//	input rune
+	//	|       expects error
+	//	|       |      expected encoding
+		('\0',
+			false, [0]),
+		('%',
+			false, [0x25]),
+		('こ',
+			false, [0xE3, 0x81, 0x93]),
+		(0xD800: rune,
+			true, []),
+		(0xDF00: rune,
+			true, []),
+		(0x110000: rune,
+			true, []),
 	];
-	const inputs = ['\0', '%', 'こ'];
-	for (let i = 0z; i < len(inputs); i += 1) {
-		const out = encoderune(inputs[i]);
-		for (let j = 0z; j < len(expected[i]); j += 1) {
-			assert(out[j] == expected[i][j]);
+
+	for (let i = 0z; i < len(testcases); i += 1) {
+		const tc    = testcases[i];
+		const input = tc.0;
+		const want  = tc.2;
+
+		match(encoderune(input)) {
+		case invalidcodepoint =>
+			assert(tc.1, "want []u8, got invalidcodepoint");
+		case let got: []u8 =>
+			assert(!tc.1, "want invalidcodepoint, got []u8");
+			for (let j = 0z; j < len(want); j += 1) {
+				assert(got[j] == want[j],
+					"[]u8 mismatch");
+			};
 		};
 	};
 };
diff --git a/fmt/print.ha b/fmt/print.ha
@@ -63,11 +63,11 @@ fn format(
 
 	let z = 0z;
 	for (z < start) {
-		z += io::write(out, utf8::encoderune(mod.pad))?;
+		z += io::write(out, utf8::encoderune(mod.pad)!)?;
 	};
 	z += format_raw(out, arg, mod)?;
 	for (z < mod.width) {
-		z += io::write(out, utf8::encoderune(mod.pad))?;
+		z += io::write(out, utf8::encoderune(mod.pad)!)?;
 	};
 
 	return z;
@@ -81,7 +81,7 @@ fn format_raw(
 case void =>
 	return io::write(out, strings::toutf8("void"));
 case let r: rune =>
-	return io::write(out, utf8::encoderune(r));
+	return io::write(out, utf8::encoderune(r)!);
 case let s: str =>
 	if (mod.prec > 0 && mod.prec < len(s)) {
 		s = strings::sub(s, 0, mod.prec);
diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha
@@ -407,7 +407,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
 	let started = false;
 	let base = strconv::base::DEC;
 	if (r.0 == '0') {
-		append(chars, utf8::encoderune(r.0)...)!;
+		append(chars, utf8::encoderune(r.0)!...)!;
 		r = match (next(lex)?) {
 		case io::EOF =>
 			return (ltok::LIT_ICONST, 0u64, loc);
@@ -487,7 +487,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
 					};
 					unget(lex, r.0);
 					float = true;
-					append(chars, utf8::encoderune('.')...)!;
+					append(chars, utf8::encoderune('.')!...)!;
 				};
 			case 'e', 'E', 'p', 'P' =>
 				if (!started) {
@@ -504,7 +504,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
 					break;
 				} else {
 					if (end == 0) end = len(chars);
-					append(chars, utf8::encoderune(r.0)...)!;
+					append(chars, utf8::encoderune(r.0)!...)!;
 					exp = len(chars);
 					r = match (next(lex)?) {
 					case io::EOF =>
@@ -514,7 +514,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
 					};
 					switch (r.0) {
 					case '+', '-' =>
-						append(chars, utf8::encoderune(r.0)...)!;
+						append(chars, utf8::encoderune(r.0)!...)!;
 					case =>
 						unget(lex, r.0);
 					};
@@ -533,7 +533,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
 				} else {
 					suff = len(chars);
 					if (end == 0) end = len(chars);
-					append(chars, utf8::encoderune(r.0)...)!;
+					append(chars, utf8::encoderune(r.0)!...)!;
 					basechrs = "0123456789";
 				};
 			case '_' =>
@@ -556,7 +556,7 @@ fn lex_literal(lex: *lexer) (token | error) = {
 			};
 		} else {
 			last_rune_was_separator = false;
-			append(chars, utf8::encoderune(r.0)...)!;
+			append(chars, utf8::encoderune(r.0)!...)!;
 		};
 		started = true;
 	};
diff --git a/memio/ops.ha b/memio/ops.ha
@@ -131,4 +131,4 @@ export fn rjoin(out: io::handle, delim: str, strs: str...) (size | io::error) = 
 
 // Appends a rune to a stream.
 export fn appendrune(out: io::handle, r: rune) (size | io::error) =
-	io::writeall(out, utf8::encoderune(r));
+	io::writeall(out, utf8::encoderune(r)!);
diff --git a/net/uri/fmt.ha b/net/uri/fmt.ha
@@ -99,7 +99,7 @@ fn percent_encode(out: io::handle, src: str, allowed: str) (size | io::error) = 
 		if (ascii::isalnum(r) || strings::contains(allowed, r)) {
 			n += fmt::fprint(out, r)?;
 		} else {
-			const en = utf8::encoderune(r);
+			const en = utf8::encoderune(r)!;
 			for (let elem .. en) {
 				n += fmt::fprintf(out, "%{:X}", elem)?;
 			};
diff --git a/shlex/escape.ha b/shlex/escape.ha
@@ -38,7 +38,7 @@ export fn quote(sink: io::handle, s: str) (size | io::error) = {
 		if (rn == '\'') {
 			z += io::writeall(sink, strings::toutf8(`'"'"'`))?;
 		} else {
-			z += io::writeall(sink, utf8::encoderune(rn))?;
+			z += io::writeall(sink, utf8::encoderune(rn)!)?;
 		};
 	};
 
diff --git a/strings/contains.ha b/strings/contains.ha
@@ -14,7 +14,7 @@ export fn contains(haystack: str, needles: (str | rune)...) bool = {
 				toutf8(s));
 		case let r: rune =>
 			yield bytes::contains(toutf8(haystack),
-				utf8::encoderune(r));
+				utf8::encoderune(r)!);
 		};
 		if (matched) {
 			return true;
diff --git a/strings/index.ha b/strings/index.ha
@@ -129,7 +129,7 @@ export fn byteindex(haystack: str, needle: (str | rune)) (size | void) = {
 	case let s: str =>
 		yield toutf8(s);
 	case let r: rune =>
-		yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r);
+		yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r)!;
 	});
 };
 
@@ -140,7 +140,7 @@ export fn rbyteindex(haystack: str, needle: (str | rune)) (size | void) = {
 	case let s: str =>
 		yield toutf8(s);
 	case let r: rune =>
-		yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r);
+		yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r)!;
 	});
 };
 
diff --git a/strings/pad.ha b/strings/pad.ha
@@ -11,7 +11,7 @@ export fn lpad(s: str, p: rune, maxlen: size) str = {
 	};
 	let res: []u8 = alloc([], maxlen)!;
 	for (let i = 0z; i < maxlen - len(s); i += 1) {
-		append(res, utf8::encoderune(p)...)!;
+		append(res, utf8::encoderune(p)!...)!;
 	};
 	append(res, toutf8(s)...)!;
 	return fromutf8_unsafe(res[..maxlen]);
@@ -40,7 +40,7 @@ export fn rpad(s: str, p: rune, maxlen: size) str = {
 	let res: []u8 = alloc([], maxlen)!;
 	append(res, toutf8(s)...)!;
 	for (let i = 0z; i < maxlen - len(s); i += 1) {
-		append(res, utf8::encoderune(p)...)!;
+		append(res, utf8::encoderune(p)!...)!;
 	};
 	return fromutf8_unsafe(res[..maxlen]);
 };
diff --git a/strings/runes.ha b/strings/runes.ha
@@ -18,7 +18,7 @@ export fn torunes(s: str) []rune = {
 export fn fromrunes(runes: []rune) str = {
 	let bytes: []u8 = [];
 	for (let r .. runes) {
-		const bs = utf8::encoderune(r);
+		const bs = utf8::encoderune(r)!;
 		append(bytes, bs...)!;
 	};
 	return fromutf8_unsafe(bytes);
diff --git a/strings/suffix.ha b/strings/suffix.ha
@@ -8,7 +8,7 @@ use encoding::utf8;
 export fn hasprefix(in: str, prefix: (str | rune)) bool = {
 	let prefix = match (prefix) {
 	case let r: rune =>
-		yield utf8::encoderune(r);
+		yield utf8::encoderune(r)!;
 	case let s: str =>
 		yield toutf8(s);
 	};
@@ -26,7 +26,7 @@ export fn hasprefix(in: str, prefix: (str | rune)) bool = {
 export fn hassuffix(in: str, suff: (str | rune)) bool = {
 	let suff = match (suff) {
 	case let r: rune =>
-		yield utf8::encoderune(r);
+		yield utf8::encoderune(r)!;
 	case let s: str =>
 		yield toutf8(s);
 	};

	hare [hare] The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log | Files | Refs | README | LICENSE

M	ascii/string.ha	|	4	++--
M	bufio/scanner_test+test.ha	|	2	+-
M	bufio/stream.ha	|	2	+-
M	cmd/haredoc/doc/html.ha	|	2	+-
M	debug/ident.ha	|	2	+-
M	encoding/asn1/strings.ha	|	6	+++---
M	encoding/utf8/encode.ha	|	54	++++++++++++++++++++++++++++++++++++++++++------------
M	fmt/print.ha	|	6	+++---
M	hare/lex/lex.ha	|	12	++++++------
M	memio/ops.ha	|	2	+-
M	net/uri/fmt.ha	|	2	+-
M	shlex/escape.ha	|	2	+-
M	strings/contains.ha	|	2	+-
M	strings/index.ha	|	4	++--
M	strings/pad.ha	|	4	++--
M	strings/runes.ha	|	2	+-
M	strings/suffix.ha	|	4	++--