hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 513dd8dbfbb955f330211c29616a9d59b3729c9f
parent a04c25428ea6c7c072ce6232e1077999b7262886
Author: Byron Torres <b@torresjrjr.com>
Date:   Sun, 29 Oct 2023 10:57:54 +0000

encoding::utf8: add error type invalidcodepoint

Signed-off-by: Byron Torres <b@torresjrjr.com>

Diffstat:
M ascii/string.ha | 4 ++--
M bufio/scanner_test+test.ha | 2 +-
M bufio/stream.ha | 2 +-
M cmd/haredoc/doc/html.ha | 2 +-
M debug/ident.ha | 2 +-
M encoding/asn1/strings.ha | 6 +++---
M encoding/utf8/encode.ha | 54 ++++++++++++++++++++++++++++++++++++++++++------------
M fmt/print.ha | 6 +++---
M hare/lex/lex.ha | 12 ++++++------
M memio/ops.ha | 2 +-
M net/uri/fmt.ha | 2 +-
M shlex/escape.ha | 2 +-
M strings/contains.ha | 2 +-
M strings/index.ha | 4 ++--
M strings/pad.ha | 4 ++--
M strings/runes.ha | 2 +-
M strings/suffix.ha | 4 ++--
17 files changed, 71 insertions(+), 41 deletions(-)

diff --git a/ascii/string.ha b/ascii/string.ha @@ -22,7 +22,7 @@ export fn strlower_buf(s: str, buf: []u8) str = { let buf = buf[..0]; let it = strings::iter(s); for (let r => strings::next(&it)) { - static append(buf, utf8::encoderune(tolower(r))...)!; + static append(buf, utf8::encoderune(tolower(r))!...)!; }; return strings::fromutf8(buf)!; }; @@ -44,7 +44,7 @@ export fn strupper_buf(s: str, buf: []u8) str = { let buf = buf[..0]; let it = strings::iter(s); for (let r => strings::next(&it)) { - static append(buf, utf8::encoderune(toupper(r))...)!; + static append(buf, utf8::encoderune(toupper(r))!...)!; }; return strings::fromutf8(buf)!; }; diff --git a/bufio/scanner_test+test.ha b/bufio/scanner_test+test.ha @@ -153,7 +153,7 @@ use types; unread(&scanner, [b]); let b = scan_rune(&scanner) as rune; - unread(&scanner, utf8::encoderune(b)); + unread(&scanner, utf8::encoderune(b)!); let l = scan_line(&scanner)! as const str; assert(l == " I will not repeat "); diff --git a/bufio/stream.ha b/bufio/stream.ha @@ -148,7 +148,7 @@ fn stream_unread(s: *stream, buf: []u8) void = { // Unreads a rune; see [[unread]]. 
export fn unreadrune(s: io::handle, rn: rune) void = { - const buf = utf8::encoderune(rn); + const buf = utf8::encoderune(rn)!; unread(s, buf); }; diff --git a/cmd/haredoc/doc/html.ha b/cmd/haredoc/doc/html.ha @@ -33,7 +33,7 @@ fn html_escape(out: io::handle, in: str) (size | io::error) = { case '\'' => yield "&apos;"; case => - yield strings::fromutf8(utf8::encoderune(rn))!; + yield strings::fromutf8(utf8::encoderune(rn)!)!; })?; }; return z; diff --git a/debug/ident.ha b/debug/ident.ha @@ -16,7 +16,7 @@ export fn symname_to_ident(name: str) const str = { static append(slice, ':')!; static append(slice, ':')!; } else { - static append(slice, utf8::encoderune(rn)...)!; + static append(slice, utf8::encoderune(rn)!...)!; }; }; diff --git a/encoding/asn1/strings.ha b/encoding/asn1/strings.ha @@ -201,7 +201,7 @@ fn bmp_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = { }; let r = endian::begetu16(rbuf): rune; - let rb = utf8::encoderune(r); + let rb = utf8::encoderune(r)!; if (len(buf) - n < len(rb)) { dataunread(s.d, rbuf); return n; @@ -227,7 +227,7 @@ fn universal_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = { }; let r = endian::begetu32(rbuf): rune; - let rb = utf8::encoderune(r); + let rb = utf8::encoderune(r)!; if (len(buf) - n < len(rb)) { dataunread(s.d, rbuf); return n; @@ -258,7 +258,7 @@ fn t61_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = { match (t61_chardecode(in)) { case let r: rune => - let raw = utf8::encoderune(r); + let raw = utf8::encoderune(r)!; const bufremain = len(buf) - n; if (len(raw) < bufremain) { buf[n..n + len(raw)] = raw[..]; diff --git a/encoding/utf8/encode.ha b/encoding/utf8/encode.ha @@ -1,13 +1,21 @@ // SPDX-License-Identifier: MPL-2.0 // (c) Hare authors <https://harelang.org> +// The value of this rune is not a valid Unicode codepoint. 
+export type invalidcodepoint = !rune; + +fn isvalidcodepoint(cp: u32) bool = { + return (cp < 0xD800 || cp > 0xDFFF) && cp <= 0x10FFFF; +}; + // Encodes a rune as UTF-8 and returns the result as a slice. The return value // is statically allocated, and will not be consistent after subsequent calls to // encoderune. -export fn encoderune(r: rune) []u8 = { +export fn encoderune(r: rune) ([]u8 | invalidcodepoint) = { let ch = r: u32, n = 0z, first = 0u8; - assert((ch < 0xD800 || ch > 0xDFFF) && ch <= 0x10FFFF, - "the rune is not a valid Unicode codepoint"); + if (!isvalidcodepoint(ch)) { + return r: invalidcodepoint; + }; if (ch < 0x80) { first = 0; @@ -33,16 +41,38 @@ export fn encoderune(r: rune) []u8 = { }; @test fn encode() void = { - const expected: [_][]u8 = [ - [0], - [0x25], - [0xE3, 0x81, 0x93], + const testcases: [](rune, bool, []u8) = [ + // input rune + // | expects error + // | | expected encoding + ('\0', + false, [0]), + ('%', + false, [0x25]), + ('こ', + false, [0xE3, 0x81, 0x93]), + (0xD800: rune, + true, []), + (0xDF00: rune, + true, []), + (0x110000: rune, + true, []), ]; - const inputs = ['\0', '%', 'こ']; - for (let i = 0z; i < len(inputs); i += 1) { - const out = encoderune(inputs[i]); - for (let j = 0z; j < len(expected[i]); j += 1) { - assert(out[j] == expected[i][j]); + + for (let i = 0z; i < len(testcases); i += 1) { + const tc = testcases[i]; + const input = tc.0; + const want = tc.2; + + match(encoderune(input)) { + case invalidcodepoint => + assert(tc.1, "want []u8, got invalidcodepoint"); + case let got: []u8 => + assert(!tc.1, "want invalidcodepoint, got []u8"); + for (let j = 0z; j < len(want); j += 1) { + assert(got[j] == want[j], + "[]u8 mismatch"); + }; }; }; }; diff --git a/fmt/print.ha b/fmt/print.ha @@ -63,11 +63,11 @@ fn format( let z = 0z; for (z < start) { - z += io::write(out, utf8::encoderune(mod.pad))?; + z += io::write(out, utf8::encoderune(mod.pad)!)?; }; z += format_raw(out, arg, mod)?; for (z < mod.width) { - z += 
io::write(out, utf8::encoderune(mod.pad))?; + z += io::write(out, utf8::encoderune(mod.pad)!)?; }; return z; @@ -81,7 +81,7 @@ fn format_raw( case void => return io::write(out, strings::toutf8("void")); case let r: rune => - return io::write(out, utf8::encoderune(r)); + return io::write(out, utf8::encoderune(r)!); case let s: str => if (mod.prec > 0 && mod.prec < len(s)) { s = strings::sub(s, 0, mod.prec); diff --git a/hare/lex/lex.ha b/hare/lex/lex.ha @@ -407,7 +407,7 @@ fn lex_literal(lex: *lexer) (token | error) = { let started = false; let base = strconv::base::DEC; if (r.0 == '0') { - append(chars, utf8::encoderune(r.0)...)!; + append(chars, utf8::encoderune(r.0)!...)!; r = match (next(lex)?) { case io::EOF => return (ltok::LIT_ICONST, 0u64, loc); @@ -487,7 +487,7 @@ fn lex_literal(lex: *lexer) (token | error) = { }; unget(lex, r.0); float = true; - append(chars, utf8::encoderune('.')...)!; + append(chars, utf8::encoderune('.')!...)!; }; case 'e', 'E', 'p', 'P' => if (!started) { @@ -504,7 +504,7 @@ fn lex_literal(lex: *lexer) (token | error) = { break; } else { if (end == 0) end = len(chars); - append(chars, utf8::encoderune(r.0)...)!; + append(chars, utf8::encoderune(r.0)!...)!; exp = len(chars); r = match (next(lex)?) 
{ case io::EOF => @@ -514,7 +514,7 @@ fn lex_literal(lex: *lexer) (token | error) = { }; switch (r.0) { case '+', '-' => - append(chars, utf8::encoderune(r.0)...)!; + append(chars, utf8::encoderune(r.0)!...)!; case => unget(lex, r.0); }; @@ -533,7 +533,7 @@ fn lex_literal(lex: *lexer) (token | error) = { } else { suff = len(chars); if (end == 0) end = len(chars); - append(chars, utf8::encoderune(r.0)...)!; + append(chars, utf8::encoderune(r.0)!...)!; basechrs = "0123456789"; }; case '_' => @@ -556,7 +556,7 @@ fn lex_literal(lex: *lexer) (token | error) = { }; } else { last_rune_was_separator = false; - append(chars, utf8::encoderune(r.0)...)!; + append(chars, utf8::encoderune(r.0)!...)!; }; started = true; }; diff --git a/memio/ops.ha b/memio/ops.ha @@ -131,4 +131,4 @@ export fn rjoin(out: io::handle, delim: str, strs: str...) (size | io::error) = // Appends a rune to a stream. export fn appendrune(out: io::handle, r: rune) (size | io::error) = - io::writeall(out, utf8::encoderune(r)); + io::writeall(out, utf8::encoderune(r)!); diff --git a/net/uri/fmt.ha b/net/uri/fmt.ha @@ -99,7 +99,7 @@ fn percent_encode(out: io::handle, src: str, allowed: str) (size | io::error) = if (ascii::isalnum(r) || strings::contains(allowed, r)) { n += fmt::fprint(out, r)?; } else { - const en = utf8::encoderune(r); + const en = utf8::encoderune(r)!; for (let elem .. en) { n += fmt::fprintf(out, "%{:X}", elem)?; }; diff --git a/shlex/escape.ha b/shlex/escape.ha @@ -38,7 +38,7 @@ export fn quote(sink: io::handle, s: str) (size | io::error) = { if (rn == '\'') { z += io::writeall(sink, strings::toutf8(`'"'"'`))?; } else { - z += io::writeall(sink, utf8::encoderune(rn))?; + z += io::writeall(sink, utf8::encoderune(rn)!)?; }; }; diff --git a/strings/contains.ha b/strings/contains.ha @@ -14,7 +14,7 @@ export fn contains(haystack: str, needles: (str | rune)...) 
bool = { toutf8(s)); case let r: rune => yield bytes::contains(toutf8(haystack), - utf8::encoderune(r)); + utf8::encoderune(r)!); }; if (matched) { return true; diff --git a/strings/index.ha b/strings/index.ha @@ -129,7 +129,7 @@ export fn byteindex(haystack: str, needle: (str | rune)) (size | void) = { case let s: str => yield toutf8(s); case let r: rune => - yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r); + yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r)!; }); }; @@ -140,7 +140,7 @@ export fn rbyteindex(haystack: str, needle: (str | rune)) (size | void) = { case let s: str => yield toutf8(s); case let r: rune => - yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r); + yield if (r: u32 <= 0x7f) r: u8 else utf8::encoderune(r)!; }); }; diff --git a/strings/pad.ha b/strings/pad.ha @@ -11,7 +11,7 @@ export fn lpad(s: str, p: rune, maxlen: size) str = { }; let res: []u8 = alloc([], maxlen)!; for (let i = 0z; i < maxlen - len(s); i += 1) { - append(res, utf8::encoderune(p)...)!; + append(res, utf8::encoderune(p)!...)!; }; append(res, toutf8(s)...)!; return fromutf8_unsafe(res[..maxlen]); @@ -40,7 +40,7 @@ export fn rpad(s: str, p: rune, maxlen: size) str = { let res: []u8 = alloc([], maxlen)!; append(res, toutf8(s)...)!; for (let i = 0z; i < maxlen - len(s); i += 1) { - append(res, utf8::encoderune(p)...)!; + append(res, utf8::encoderune(p)!...)!; }; return fromutf8_unsafe(res[..maxlen]); }; diff --git a/strings/runes.ha b/strings/runes.ha @@ -18,7 +18,7 @@ export fn torunes(s: str) []rune = { export fn fromrunes(runes: []rune) str = { let bytes: []u8 = []; for (let r .. 
runes) { - const bs = utf8::encoderune(r); + const bs = utf8::encoderune(r)!; append(bytes, bs...)!; }; return fromutf8_unsafe(bytes); diff --git a/strings/suffix.ha b/strings/suffix.ha @@ -8,7 +8,7 @@ use encoding::utf8; export fn hasprefix(in: str, prefix: (str | rune)) bool = { let prefix = match (prefix) { case let r: rune => - yield utf8::encoderune(r); + yield utf8::encoderune(r)!; case let s: str => yield toutf8(s); }; @@ -26,7 +26,7 @@ export fn hasprefix(in: str, prefix: (str | rune)) bool = { export fn hassuffix(in: str, suff: (str | rune)) bool = { let suff = match (suff) { case let r: rune => - yield utf8::encoderune(r); + yield utf8::encoderune(r)!; case let s: str => yield toutf8(s); };