commit d756778d288097017515ddb4e60f6f7c41ed96f9
parent d594cbee291e9b9032f461611c367b44d538d672
Author: Alexey Yerin <yyp@disroot.org>
Date: Thu, 17 Mar 2022 21:44:10 +0300
Implement net::uri
Implements: https://todo.sr.ht/~sircmpwn/hare/329
Signed-off-by: Alexey Yerin <yyp@disroot.org>
Diffstat:
7 files changed, 665 insertions(+), 0 deletions(-)
diff --git a/net/uri/+test.ha b/net/uri/+test.ha
@@ -0,0 +1,121 @@
+use net::ip;
+
+// Checks that well-formed URIs are split into the expected components. Each
+// entry pairs an input string with the [[uri]] it should produce.
+@test fn parse() void = {
+	const cases: [_](str, uri) = [
+		(
+			"http://harelang.org/",
+			uri {
+				scheme = "http",
+				host = "harelang.org",
+				path = "/",
+				...
+			}
+		),
+		(
+			"irc+insecure://chat.sr.ht:6667",
+			uri {
+				scheme = "irc+insecure",
+				host = "chat.sr.ht",
+				port = 6667,
+				...
+			}
+		),
+		(
+			"ldap://13.37.73.31:1234/",
+			uri {
+				scheme = "ldap",
+				host = [13, 37, 73, 31]: ip::addr4,
+				port = 1234,
+				path = "/",
+				...
+			}
+		),
+		(
+			"http://[::1]/test",
+			uri {
+				scheme = "http",
+				host = ip::parse("::1")!,
+				path = "/test",
+				...
+			}
+		),
+		// Percent decoding in various places
+		(
+			"https://git%2esr.ht/~sircmpw%6e/hare#Build%20status",
+			uri {
+				scheme = "https",
+				host = "git.sr.ht",
+				path = "/~sircmpwn/hare",
+				fragment = "Build status",
+				...
+			}
+		),
+		// IPv6
+		(
+			"ldap://[2001:db8::7]/c=GB?objectClass?one",
+			uri {
+				scheme = "ldap",
+				host = ip::parse("2001:db8::7")!,
+				path = "/c=GB",
+				query = "objectClass?one",
+				...
+			}
+		),
+		// Non-URL variants such as URN and mailto:
+		(
+			"urn:example:animal:ferret:nose",
+			uri {
+				scheme = "urn",
+				host = "",
+				path = "example:animal:ferret:nose",
+				...
+			}
+		),
+		(
+			"mailto:~sircmpwn/hare-dev@lists.sr.ht",
+			uri {
+				scheme = "mailto",
+				host = "",
+				path = "~sircmpwn/hare-dev@lists.sr.ht",
+				...
+			}
+		),
+		// https://bugs.chromium.org/p/chromium/issues/detail?id=841105
+		(
+			"https://web-safety.net/..;@www.google.com:%3443",
+			uri {
+				scheme = "https",
+				host = "web-safety.net",
+				path = "/..;@www.google.com:443",
+				...
+			}
+		),
+	];
+
+	for (let i = 0z; i < len(cases); i += 1) {
+		test_uri(cases[i].0, cases[i].1)!;
+	};
+};
+
+// Checks that inputs with a missing or malformed scheme are rejected.
+@test fn invalid() void = {
+	// Empty scheme, illegal scheme character, missing ':' terminator
+	const inputs: [_]str = [":", "hello*:", "hello"];
+	for (let i = 0z; i < len(inputs); i += 1) {
+		assert(parse(inputs[i]) is invalid);
+	};
+};
+
+// Parses "in" and asserts that every component of the result equals the
+// corresponding component of "expected". A parse failure is propagated to the
+// caller.
+fn test_uri(in: str, expected: uri) (void | invalid) = {
+	const actual = parse(in)?;
+	defer finish(&actual);
+
+	assert(actual.scheme == expected.scheme);
+
+	// The host is compared according to whichever variant was parsed
+	match (actual.host) {
+	case let addr: ip::addr =>
+		assert(ip::equal(addr, expected.host as ip::addr));
+	case let name: str =>
+		assert(name == expected.host as str);
+	};
+
+	assert(actual.port == expected.port);
+	assert(actual.userinfo == expected.userinfo);
+	assert(actual.path == expected.path);
+	assert(actual.query == expected.query);
+	assert(actual.fragment == expected.fragment);
+};
diff --git a/net/uri/README b/net/uri/README
@@ -0,0 +1,2 @@
+The net::uri module provides utilities for parsing and formatting Uniform
+Resource Identifiers (RFC 3986).
diff --git a/net/uri/fmt.ha b/net/uri/fmt.ha
@@ -0,0 +1,80 @@
+use ascii;
+use encoding::utf8;
+use fmt;
+use io;
+use net::ip;
+use strconv;
+use strings;
+use strio;
+
+// Writes a formatted [[uri]] to an [[io::handle]]. Returns the number of bytes
+// written.
+//
+// The userinfo, a named host and the fragment are percent-encoded on output.
+// The path and the query are written exactly as stored; [[parse]] keeps the
+// query in its percent-encoded form, so encoding it again would corrupt it.
+export fn fmt(out: io::handle, u: *const uri) (size | io::error) = {
+	let n = 0z;
+	// Set once the "//" introducing the authority has been written
+	let slashes_w = false;
+	n += fmt::fprintf(out, "{}:", u.scheme)?;
+	if (len(u.userinfo) > 0) {
+		// Userinfo is meaningless without a host to attach it to
+		assert(!(u.host is str) || len(u.host as str) > 0);
+		n += fmt::fprint(out, "//")?;
+		n += percent_encode(out, u.userinfo)?;
+		n += fmt::fprint(out, "@")?;
+		slashes_w = true;
+	};
+	match (u.host) {
+	case let host: str =>
+		if (len(host) > 0) {
+			if (!slashes_w) {
+				n += fmt::fprint(out, "//")?;
+			};
+			n += percent_encode(out, host)?;
+		};
+	case let addr: ip::addr =>
+		if (!slashes_w) {
+			n += fmt::fprint(out, "//")?;
+		};
+		// An IPv6 literal must be bracketed, otherwise its colons are
+		// indistinguishable from the port separator
+		if (addr is ip::addr6) {
+			n += fmt::fprint(out, "[")?;
+			n += ip::fmt(out, addr)?;
+			n += fmt::fprint(out, "]")?;
+		} else {
+			n += ip::fmt(out, addr)?;
+		};
+	};
+	// A port of zero means "no port"
+	if (u.port != 0) {
+		n += fmt::fprintf(out, ":{}", u.port)?;
+	};
+	n += fmt::fprint(out, u.path)?;
+	if (len(u.query) > 0) {
+		n += fmt::fprintf(out, "?{}", u.query)?;
+	};
+	if (len(u.fragment) > 0) {
+		n += fmt::fprint(out, "#")?;
+		n += percent_encode(out, u.fragment)?;
+	};
+
+	return n;
+};
+
+// Writes "src" to "out", replacing every rune outside the RFC 3986
+// "unreserved" set with the percent-encoded form of its UTF-8 bytes. Returns
+// the number of bytes written.
+fn percent_encode(out: io::handle, src: str) (size | io::error) = {
+	let iter = strings::iter(src);
+	let n = 0z;
+	for (true) {
+		const r = match (strings::next(&iter)) {
+		case let r: rune =>
+			yield r;
+		case =>
+			break;
+		};
+		// unreserved
+		if (ascii::isalnum(r) || strings::contains("-._~", r)) {
+			n += fmt::fprint(out, r)?;
+			continue;
+		};
+		const en = utf8::encoderune(r);
+		for (let i = 0z; i < len(en); i += 1) {
+			const hex = strconv::u8tosb(en[i], strconv::base::HEX);
+			// An escape is always "%" plus exactly two hex digits, so
+			// bytes below 0x10 need a leading zero
+			const lead = if (len(hex) < 2) "%0" else "%";
+			n += fmt::fprintf(out, "{}{}", lead, hex)?;
+		};
+	};
+	return n;
+};
+
+// Returns the textual form of a [[uri]] as a newly allocated string. The
+// caller is responsible for freeing the result.
+export fn string(u: *const uri) str = {
+	const sink = strio::dynamic();
+	fmt(&sink, u)!;
+	return strio::string(&sink);
+};
diff --git a/net/uri/parse.ha b/net/uri/parse.ha
@@ -0,0 +1,375 @@
+use ascii;
+use encoding::utf8;
+use io;
+use net::ip;
+use strconv;
+use strings;
+use strio;
+
+// The URI provided to [[parse]] is invalid. This error carries no further
+// detail about which component was malformed.
+export type invalid = !void;
+
+// Parses a URI string into a [[uri]] structure. The return value must be freed
+// using [[finish]].
+//
+// The host is returned as an [[ip::addr]] when it is a valid IP address and as
+// a string otherwise. Returns [[invalid]] if the input is not a well-formed
+// URI; nothing needs to be freed in that case.
+export fn parse(in: str) (uri | invalid) = {
+	// Until the whole URI has been parsed, every component allocated so far
+	// is released when returning early with an error
+	let success = false;
+	let in = strings::iter(in);
+
+	const scheme = parse_scheme(&in)?;
+	defer if (!success) free(scheme);
+
+	// Determine hier-part variant
+	let path = "";
+	let authority = ("", 0u16, "");
+	defer if (!success) {
+		free(path);
+		free(authority.0);
+		free(authority.2);
+	};
+	match (strings::next(&in)) {
+	case let r: rune =>
+		switch (r) {
+		case '/' =>
+			// Either "//"+authority+path-abempty or path-absolute
+			switch (wantrune(&in)?) {
+			case '/' =>
+				// "//" + authority + path-abempty
+				authority = parse_authority(&in)?;
+				match (strings::next(&in)) {
+				case let r: rune =>
+					switch (r) {
+					case '?', '#' =>
+						// path-empty
+						strings::prev(&in);
+					case '/' =>
+						// path-absolute
+						strings::prev(&in);
+						path = parse_path(&in,
+							path_mode::ABSOLUTE)?;
+					case =>
+						return invalid;
+					};
+				case => void; // path-empty
+				};
+			case =>
+				// path-absolute
+				strings::prev(&in);
+				path = parse_path(&in, path_mode::ABSOLUTE)?;
+			};
+		case =>
+			// path-rootless
+			strings::prev(&in);
+			path = parse_path(&in, path_mode::ROOTLESS)?;
+		};
+	case => void; // path-empty
+	};
+
+	let query = "";
+	defer if (!success) free(query);
+	match (strings::next(&in)) {
+	case let r: rune =>
+		if (r == '?') {
+			query = parse_query(&in)?;
+		} else {
+			strings::prev(&in);
+		};
+	case => void;
+	};
+
+	// The fragment is the last fallible step, so it needs no cleanup of
+	// its own
+	let fragment = "";
+	match (strings::next(&in)) {
+	case let r: rune =>
+		if (r == '#') {
+			fragment = parse_fragment(&in)?;
+		} else {
+			strings::prev(&in);
+		};
+	case => void;
+	};
+
+	const host: (str | ip::addr) = match (ip::parse(authority.0)) {
+	case let a: ip::addr =>
+		// Only the address is kept; the textual form would
+		// otherwise never be freed
+		free(authority.0);
+		yield a;
+	case ip::invalid =>
+		yield authority.0;
+	};
+
+	success = true;
+	return uri {
+		scheme = scheme,
+
+		host = host,
+		port = authority.1,
+		userinfo = authority.2,
+
+		path = path,
+		query = query,
+		fragment = fragment,
+	};
+};
+
+// Reads the scheme up to and including its ':' terminator and returns it
+// without the terminator. The scheme must start with a letter, followed by
+// letters, digits, '+', '-' or '.'. The caller must free the return value.
+fn parse_scheme(in: *strings::iterator) (str | invalid) = {
+	let buf = strio::dynamic();
+	// The buffer backs the returned string, so it is only released when
+	// parsing fails
+	let success = false;
+	defer if (!success) io::close(&buf);
+
+	for (let i = 0z; true; i += 1) {
+		// Running out of input before ':' means there is no scheme
+		const r = wantrune(in)?;
+		if (i > 0 && r == ':') {
+			break;
+		};
+		if (i == 0) {
+			if (!ascii::isalpha(r)) {
+				return invalid;
+			};
+		} else {
+			if (!ascii::isalnum(r) && !strings::contains("+-.", r)) {
+				return invalid;
+			};
+		};
+		strio::appendrune(&buf, r)!;
+	};
+
+	success = true;
+	return strio::string(&buf);
+};
+
+// Parses the authority component ([userinfo "@"] host [":" port]) which
+// follows "//". Returns a (host, port, userinfo) tuple; the port is zero when
+// absent. Both strings are percent-decoded and must be freed by the caller.
+//
+// The iterator is left positioned on the '/', '?' or '#' that ends the
+// authority (or at the end of input) so that the caller can inspect it.
+fn parse_authority(in: *strings::iterator) ((str, u16, str) | invalid) = {
+	// Scan everything until a delimiter, then decide what it was
+	let buf = strio::dynamic();
+	defer io::close(&buf);
+	let host = "";
+	let port = 0u16;
+	let userinfo = "";
+	// Tracked separately from the strings, both of which may
+	// legitimately be empty
+	let has_host = false;
+	let has_userinfo = false;
+
+	let success = false;
+	defer if (!success) {
+		free(host);
+		free(userinfo);
+	};
+
+	for (true) {
+		const r = match (strings::next(in)) {
+		case let r: rune =>
+			yield r;
+		case void =>
+			break;
+		};
+
+		if (r == '[') {
+			// An IP-literal must make up the whole host
+			if (has_host || len(strio::string(&buf)) > 0) {
+				return invalid;
+			};
+			for (true) {
+				const r = wantrune(in)?;
+				if (r == ']') {
+					break;
+				};
+				strio::appendrune(&buf, r)!;
+			};
+			host = percent_decode(strio::string(&buf))?;
+			has_host = true;
+			strio::reset(&buf);
+		} else if (r == ':' || !is_userinfo(r) && !is_host(r)) {
+			switch (r) {
+			case '@' =>
+				// Only one userinfo, and only before the host
+				if (has_userinfo || has_host) {
+					return invalid;
+				};
+				userinfo = percent_decode(strio::string(&buf))?;
+				has_userinfo = true;
+				strio::reset(&buf);
+			case '/', '?', '#' =>
+				// End of the authority; the delimiter belongs to
+				// the caller
+				strings::prev(in);
+				break;
+			case ':' =>
+				// Whatever was scanned so far is the host
+				port = parse_port(in)?;
+				break;
+			case =>
+				// Not allowed anywhere in an authority
+				return invalid;
+			};
+		} else {
+			// Nothing but a port may follow an IP-literal
+			if (has_host) {
+				return invalid;
+			};
+			strio::appendrune(&buf, r)!;
+		};
+	};
+
+	// Unless an IP-literal was read, the pending text is the host
+	if (!has_host) {
+		host = percent_decode(strio::string(&buf))?;
+	};
+
+	success = true;
+	return (host, port, userinfo);
+};
+
+// Selects which RFC 3986 path production [[parse_path]] applies.
+type path_mode = enum {
+ // The path follows an authority or starts with '/'.
+ ABSOLUTE,
+ // The path has no leading '/', as in "mailto:" or "urn:" URIs.
+ ROOTLESS,
+};
+
+// Reads a path from the iterator, stopping in front of '?' or '#' or at the
+// end of input, and returns it percent-decoded. In ROOTLESS mode the path may
+// not begin with '/'. The caller must free the return value.
+fn parse_path(in: *strings::iterator, mode: path_mode) (str | invalid) = {
+	let raw = strio::dynamic();
+	defer io::close(&raw);
+
+	for (let i = 0z; true; i += 1) {
+		const r = match (strings::next(in)) {
+		case void =>
+			break;
+		case let r: rune =>
+			yield r;
+		};
+
+		if (r == '?' || r == '#') {
+			// Start of the query or fragment; leave it for the caller
+			strings::prev(in);
+			break;
+		};
+		if (r == '/') {
+			// A rootless path needs a non-empty first segment
+			if (i == 0 && mode == path_mode::ROOTLESS) {
+				return invalid;
+			};
+		} else if (!is_pchar(r)) {
+			return invalid;
+		};
+		strio::appendrune(&raw, r)!;
+	};
+
+	return percent_decode(strio::string(&raw));
+};
+
+// Reads the query which follows '?', stopping in front of '#' or at the end of
+// input. The query is returned as written, without percent-decoding. The
+// caller must free the return value.
+fn parse_query(in: *strings::iterator) (str | invalid) = {
+	let buf = strio::dynamic();
+	// The buffer backs the returned string, so it is only released when
+	// parsing fails
+	let success = false;
+	defer if (!success) io::close(&buf);
+
+	for (true) {
+		match (strings::next(in)) {
+		case let r: rune =>
+			if (r == '#') {
+				// Start of the fragment; leave it for the caller
+				strings::prev(in);
+				break;
+			};
+			if (!is_pchar(r) && r != '/' && r != '?') {
+				return invalid;
+			};
+			strio::appendrune(&buf, r)!;
+		case void =>
+			break;
+		};
+	};
+
+	success = true;
+	return strio::string(&buf);
+};
+
+// Reads the fragment which follows '#', consuming the rest of the input, and
+// returns it percent-decoded. The caller must free the return value.
+fn parse_fragment(in: *strings::iterator) (str | invalid) = {
+	let raw = strio::dynamic();
+	defer io::close(&raw);
+
+	for (true) {
+		const r = match (strings::next(in)) {
+		case void =>
+			break;
+		case let r: rune =>
+			yield r;
+		};
+		const allowed = is_pchar(r) || r == '/' || r == '?';
+		if (!allowed) {
+			return invalid;
+		};
+		strio::appendrune(&raw, r)!;
+	};
+
+	return percent_decode(strio::string(&raw));
+};
+
+// Reads a run of decimal digits and converts it to a port number, leaving the
+// iterator in front of the first non-digit. An empty run, zero, and values
+// which do not fit in a u16 are all rejected.
+fn parse_port(in: *strings::iterator) (u16 | invalid) = {
+	let digits = strio::dynamic();
+	defer io::close(&digits);
+
+	for (true) {
+		match (strings::next(in)) {
+		case void =>
+			break;
+		case let r: rune =>
+			if (!ascii::isdigit(r)) {
+				strings::prev(in);
+				break;
+			};
+			strio::appendrune(&digits, r)!;
+		};
+	};
+
+	const port = match (strconv::stou16(strio::string(&digits))) {
+	case let n: u16 =>
+		yield n;
+	case =>
+		return invalid;
+	};
+	// There's no port 0
+	if (port == 0) {
+		return invalid;
+	};
+	return port;
+};
+
+// Returns a copy of "s" with every "%XX" escape replaced by the byte it
+// encodes. Returns [[invalid]] if an escape is truncated, is not made of two
+// hexadecimal digits, or if the decoded bytes are not valid UTF-8. The caller
+// must free the return value.
+fn percent_decode(s: str) (str | invalid) = {
+	// Escapes encode bytes rather than code points: a non-ASCII character
+	// is spelled as several escapes, so the result is assembled as bytes
+	// and validated as a whole
+	let out: []u8 = [];
+	let success = false;
+	defer if (!success) free(out);
+
+	let hex = strio::dynamic();
+	defer io::close(&hex);
+
+	let iter = strings::iter(s);
+	for (true) {
+		const r = match (strings::next(&iter)) {
+		case let r: rune =>
+			yield r;
+		case void =>
+			break;
+		};
+		if (r != '%') {
+			append(out, utf8::encoderune(r)...);
+			continue;
+		};
+
+		strio::reset(&hex);
+		for (let i = 0z; i < 2; i += 1) {
+			const digit = wantrune(&iter)?;
+			if (!ascii::isxdigit(digit)) {
+				return invalid;
+			};
+			strio::appendrune(&hex, digit)!;
+		};
+		match (strconv::stou8b(strio::string(&hex),
+			strconv::base::HEX)) {
+		case let ord: u8 =>
+			append(out, ord);
+		case =>
+			return invalid;
+		};
+	};
+
+	if (!utf8::valid(out)) {
+		return invalid;
+	};
+	success = true;
+	return strings::fromutf8(out);
+};
+
+// Returns the next rune from the iterator, or [[invalid]] if the input is
+// exhausted.
+fn wantrune(iter: *strings::iterator) (rune | invalid) = {
+	match (strings::next(iter)) {
+	case void =>
+		return invalid;
+	case let next: rune =>
+		return next;
+	};
+};
+
+// Reports whether a rune may appear in the userinfo component: an unreserved
+// character, a sub-delimiter, ':' or part of a percent-encoded octet.
+fn is_userinfo(r: rune) bool = {
+	// %-encoded
+	if (r == '%' || ascii::isxdigit(r)) {
+		return true;
+	};
+	// unreserved + sub-delim + ":"
+	return ascii::isalnum(r)
+		|| strings::contains("-._~!$&'()*+,;=:", r);
+};
+
+// Reports whether a rune may appear in a registered host name: an unreserved
+// character, a sub-delimiter or part of a percent-encoded octet.
+fn is_host(r: rune) bool = {
+	// %-encoded
+	if (r == '%' || ascii::isxdigit(r)) {
+		return true;
+	};
+	// unreserved + sub-delim
+	return ascii::isalnum(r)
+		|| strings::contains("-._~!$&'()*+,;=", r);
+};
+
+// Reports whether a rune is a path character ("pchar"): an unreserved
+// character, a sub-delimiter, ':', '@' or part of a percent-encoded octet.
+fn is_pchar(r: rune) bool = {
+	// %-encoded
+	if (r == '%' || ascii::isxdigit(r)) {
+		return true;
+	};
+	// unreserved + sub-delim + ":"/"@"
+	return ascii::isalnum(r)
+		|| strings::contains("-._~!$&'()*+,;=:@", r);
+};
diff --git a/net/uri/uri.ha b/net/uri/uri.ha
@@ -0,0 +1,28 @@
+use net::ip;
+
+// Representation of a parsed URI. Values returned by [[parse]] own their
+// strings and must be released with [[finish]].
+export type uri = struct {
+ // Scheme name, without the trailing ':'.
+ scheme: str,
+
+ // Host, either as a name or as an IP address. An empty string means
+ // that the URI has no host.
+ host: (str | ip::addr),
+ // Port number, or zero if the URI does not specify one.
+ port: u16,
+ // User information preceding '@' in the authority, or an empty string.
+ userinfo: str,
+
+ // Path, percent-decoded by [[parse]].
+ path: str,
+ // Query, without the leading '?'. [[parse]] stores it without
+ // percent-decoding.
+ query: str,
+ // Fragment, without the leading '#', percent-decoded by [[parse]].
+ fragment: str,
+};
+
+// Releases the strings owned by a [[uri]]. An IP address host holds no
+// allocation and is left alone.
+export fn finish(u: *uri) void = {
+	free(u.scheme);
+	if (u.host is str) {
+		free(u.host as str);
+	};
+	free(u.userinfo);
+	free(u.path);
+	free(u.query);
+	free(u.fragment);
+};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -933,6 +933,27 @@ net_unix() {
gen_ssa -pfreebsd net::unix net errors os io strings types fmt net::dial
}
+# Passes the sources of net::uri to gen_srcs. Any arguments are appended to the
+# source list, which net_uri uses to add the test sources.
+gensrcs_net_uri() {
+ gen_srcs net::uri \
+ fmt.ha \
+ parse.ha \
+ uri.ha \
+ $*
+}
+
+# Emits the build rules for net::uri, including +test.ha when generating the
+# test library. The gen_ssa dependency list mirrors the modules imported by
+# the net::uri sources.
+net_uri() {
+	if [ $testing -eq 0 ]
+	then
+		gensrcs_net_uri
+	else
+		gensrcs_net_uri \
+			+test.ha
+	fi
+	gen_ssa net::uri \
+		ascii encoding::utf8 fmt io net::ip strconv strings strio
+}
+
+
math_random() {
gen_srcs math::random \
random.ha
@@ -1251,6 +1272,7 @@ net::ip linux freebsd
net::tcp linux freebsd
net::udp linux freebsd
net::unix linux freebsd
+net::uri
os linux freebsd
os::exec linux freebsd
path
diff --git a/stdlib.mk b/stdlib.mk
@@ -510,6 +510,12 @@ stdlib_deps_linux+=$(stdlib_net_unix_linux)
stdlib_net_unix_freebsd=$(HARECACHE)/net/unix/net_unix-freebsd.o
stdlib_deps_freebsd+=$(stdlib_net_unix_freebsd)
+# gen_lib net::uri (any)
+stdlib_net_uri_any=$(HARECACHE)/net/uri/net_uri-any.o
+stdlib_deps_any+=$(stdlib_net_uri_any)
+stdlib_net_uri_linux=$(stdlib_net_uri_any)
+stdlib_net_uri_freebsd=$(stdlib_net_uri_any)
+
# gen_lib os (linux)
stdlib_os_linux=$(HARECACHE)/os/os-linux.o
stdlib_deps_linux+=$(stdlib_os_linux)
@@ -1491,6 +1497,18 @@ $(HARECACHE)/net/unix/net_unix-freebsd.ssa: $(stdlib_net_unix_freebsd_srcs) $(st
@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nnet::unix \
-t$(HARECACHE)/net/unix/net_unix.td $(stdlib_net_unix_freebsd_srcs)
+# net::uri (+any)
+stdlib_net_uri_any_srcs= \
+	$(STDLIB)/net/uri/fmt.ha \
+	$(STDLIB)/net/uri/parse.ha \
+	$(STDLIB)/net/uri/uri.ha
+
+$(HARECACHE)/net/uri/net_uri-any.ssa: $(stdlib_net_uri_any_srcs) $(stdlib_rt) $(stdlib_ascii_$(PLATFORM)) $(stdlib_encoding_utf8_$(PLATFORM)) $(stdlib_fmt_$(PLATFORM)) $(stdlib_io_$(PLATFORM)) $(stdlib_net_ip_$(PLATFORM)) $(stdlib_strconv_$(PLATFORM)) $(stdlib_strings_$(PLATFORM)) $(stdlib_strio_$(PLATFORM))
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(HARECACHE)/net/uri
+	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nnet::uri \
+		-t$(HARECACHE)/net/uri/net_uri.td $(stdlib_net_uri_any_srcs)
+
# os (+linux)
stdlib_os_linux_srcs= \
$(STDLIB)/os/+linux/environ.ha \
@@ -2339,6 +2357,12 @@ testlib_deps_linux+=$(testlib_net_unix_linux)
testlib_net_unix_freebsd=$(TESTCACHE)/net/unix/net_unix-freebsd.o
testlib_deps_freebsd+=$(testlib_net_unix_freebsd)
+# gen_lib net::uri (any)
+testlib_net_uri_any=$(TESTCACHE)/net/uri/net_uri-any.o
+testlib_deps_any+=$(testlib_net_uri_any)
+testlib_net_uri_linux=$(testlib_net_uri_any)
+testlib_net_uri_freebsd=$(testlib_net_uri_any)
+
# gen_lib os (linux)
testlib_os_linux=$(TESTCACHE)/os/os-linux.o
testlib_deps_linux+=$(testlib_os_linux)
@@ -3358,6 +3382,19 @@ $(TESTCACHE)/net/unix/net_unix-freebsd.ssa: $(testlib_net_unix_freebsd_srcs) $(t
@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nnet::unix \
-t$(TESTCACHE)/net/unix/net_unix.td $(testlib_net_unix_freebsd_srcs)
+# net::uri (+any)
+testlib_net_uri_any_srcs= \
+	$(STDLIB)/net/uri/fmt.ha \
+	$(STDLIB)/net/uri/parse.ha \
+	$(STDLIB)/net/uri/uri.ha \
+	$(STDLIB)/net/uri/+test.ha
+
+$(TESTCACHE)/net/uri/net_uri-any.ssa: $(testlib_net_uri_any_srcs) $(testlib_rt) $(testlib_ascii_$(PLATFORM)) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_fmt_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_net_ip_$(PLATFORM)) $(testlib_strconv_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_strio_$(PLATFORM))
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(TESTCACHE)/net/uri
+	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nnet::uri \
+		-t$(TESTCACHE)/net/uri/net_uri.td $(testlib_net_uri_any_srcs)
+
# os (+linux)
testlib_os_linux_srcs= \
$(STDLIB)/os/+linux/environ.ha \