hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

parse.ha (8749B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use ascii;
      5 use encoding::utf8;
      6 use io;
      7 use memio;
      8 use net::ip;
      9 use strconv;
     10 use strings;
     11 
// The URI provided to [[parse]] is invalid. The internal parsing helpers in
// this file report malformed input with this same error type.
export type invalid = !void;
     14 
     15 // Parses a URI string into [[uri]] structure. The return value must be freed
     16 // using [[finish]].
     17 export fn parse(in: str) (uri | invalid) = {
     18 	let success = false;
     19 	let in = strings::iter(in);
     20 
     21 	const scheme = parse_scheme(&in)?;
     22 	defer if (!success) free(scheme);
     23 
     24 	// Determine hier-part variant
     25 	let path = "";
     26 	let authority: ((str | ip::addr6), u16, str) = ("", 0u16, "");
     27 	defer if (!success) {
     28 		free(path);
     29 		free_host(authority.0);
     30 		free(authority.2);
     31 	};
     32 
     33 	match (strings::next(&in)) {
     34 	case let r: rune =>
     35 		switch (r) {
     36 		case '/' =>
     37 			// Either "//"+authority+path-abempty or path-absolute
     38 			match (strings::next(&in)) {
     39 			case let r: rune =>
     40 				switch(r) {
     41 				case '/' =>
     42 					// "//" + authority + path-abempty
     43 					authority = parse_authority(&in)?;
     44 					match (strings::next(&in)) {
     45 					case let r: rune =>
     46 						switch (r) {
     47 						case '?', '#' =>
     48 							// path-empty
     49 							strings::prev(&in);
     50 						case '/' =>
     51 							// path-absolute
     52 							strings::prev(&in);
     53 							path = parse_path(&in,
     54 								path_mode::ABSOLUTE)?;
     55 						case =>
     56 							return invalid;
     57 						};
     58 					case => void; // path-empty
     59 					};
     60 				case =>
     61 					// path-absolute
     62 					strings::prev(&in); // return current token
     63 					strings::prev(&in); // return leading slash
     64 					path = parse_path(&in, path_mode::ABSOLUTE)?;
     65 				};
     66 			case =>
     67 				// path-absolute (just '/')
     68 				strings::prev(&in); // return leading slash
     69 				path = parse_path(&in, path_mode::ABSOLUTE)?;
     70 			};
     71 		case =>
     72 			// path-rootless
     73 			strings::prev(&in);
     74 			path = parse_path(&in, path_mode::ROOTLESS)?;
     75 		};
     76 	case => void; // path-empty
     77 	};
     78 
     79 	let query = "";
     80 	defer if (!success) free(query);
     81 	match (strings::next(&in)) {
     82 	case let r: rune =>
     83 		if (r == '?') {
     84 			query = parse_query(&in)?;
     85 		} else {
     86 			strings::prev(&in);
     87 		};
     88 	case => void;
     89 	};
     90 
     91 	let fragment = "";
     92 	match (strings::next(&in)) {
     93 	case let r: rune =>
     94 		if (r == '#') {
     95 			fragment = parse_fragment(&in)?;
     96 		} else {
     97 			strings::prev(&in);
     98 		};
     99 	case => void;
    100 	};
    101 
    102 	success = true;
    103 	return uri {
    104 		scheme = scheme,
    105 
    106 		host = match (authority.0) {
    107 		case let ip: ip::addr6 =>
    108 			yield ip;
    109 		case let s: str =>
    110 			yield match (ip::parse(s)) {
    111 			case let a: ip::addr =>
    112 				free(s);
    113 				yield a;
    114 			case ip::invalid =>
    115 				yield s;
    116 			};
    117 		},
    118 		port = authority.1,
    119 		userinfo = authority.2,
    120 
    121 		path = path,
    122 		query = query,
    123 		fragment = fragment,
    124 	};
    125 };
    126 
    127 fn parse_scheme(in: *strings::iterator) (str | invalid) = {
    128 	let copy = *in;
    129 	for (let i = 0z; true; i += 1) {
    130 		const r = wantrune(in)?;
    131 		if (i > 0 && r == ':') {
    132 			strings::prev(in);
    133 			break;
    134 		};
    135 		if (i == 0) {
    136 			if (!ascii::isalpha(r)) {
    137 				return invalid;
    138 			};
    139 		} else {
    140 			if (!ascii::isalnum(r) && !strings::contains("+-.", r)) {
    141 				return invalid;
    142 			};
    143 		};
    144 	};
    145 	let s = strings::dup(strings::slice(&copy, in));
    146 	strings::next(in);
    147 	return s;
    148 };
    149 
    150 fn parse_authority(
    151 	in: *strings::iterator,
    152 ) (((str | ip::addr6), u16, str) | invalid) = {
    153 	// Scan everything until '@' or ':' or '/', then decide what it is
    154 	let success = false;
    155 	let buf = memio::dynamic();
    156 	defer io::close(&buf)!;
    157 	let host: (str | ip::addr6) = "";
    158 	let port = 0u16;
    159 	let userinfo = "";
    160 	let has_userinfo = false;
    161 	defer if (!success) {
    162 		free_host(host);
    163 		free(userinfo);
    164 	};
    165 
    166 	for (let r => strings::next(in)) {
    167 		if (r == '[') {
    168 			if (len(memio::string(&buf)!) > 0) {
    169 				if (len(userinfo) > 0) {
    170 					return invalid;
    171 				} else {
    172 					userinfo = percent_decode(
    173 						memio::string(&buf)!)?;
    174 				};
    175 			};
    176 			memio::reset(&buf);
    177 
    178 			for (true) {
    179 				const r = wantrune(in)?;
    180 				if (r == ']') {
    181 					break;
    182 				};
    183 				memio::appendrune(&buf, r)!;
    184 			};
    185 
    186 			const addr = percent_decode(memio::string(&buf)!)?;
    187 			defer free(addr);
    188 			match (ip::parse(addr)) {
    189 			case let v6: ip::addr6 =>
    190 				host = v6;
    191 			case =>
    192 				return invalid;
    193 			};
    194 		} else if (r == ':' || !is_userinfo(r) && !is_host(r)) {
    195 			switch (r) {
    196 			case '@' =>
    197 				if (has_userinfo) {
    198 					return invalid;
    199 				};
    200 				// This was userinfo+host[+port]
    201 				userinfo = percent_decode(memio::string(&buf)!)?;
    202 				memio::reset(&buf);
    203 				has_userinfo = true;
    204 			case '/' =>
    205 				// This was just host
    206 				strings::prev(in);
    207 				host = percent_decode(memio::string(&buf)!)?;
    208 				break;
    209 			case ':' =>
    210 				// This was host+port
    211 				host = percent_decode(memio::string(&buf)!)?;
    212 				port = parse_port(in)?;
    213 				break;
    214 			case =>
    215 				return invalid;
    216 			};
    217 		} else {
    218 			memio::appendrune(&buf, r)!;
    219 		};
    220 	};
    221 
    222 	match (host) {
    223 	case let s: str =>
    224 		// In end of string case
    225 		if (len(s) == 0) {
    226 			host = percent_decode(memio::string(&buf)!)?;
    227 		};
    228 	case => void;
    229 	};
    230 
    231 	success = true;
    232 	return (host, port, userinfo);
    233 };
    234 
// Selects which kind of path [[parse_path]] is parsing.
type path_mode = enum {
	// The path begins with a '/'.
	ABSOLUTE,
	// The path does not begin with a '/'; parse_path rejects a '/' in the
	// first position when this mode is used.
	ROOTLESS,
};
    239 
    240 fn parse_path(in: *strings::iterator, mode: path_mode) (str | invalid) = {
    241 	let copy = *in;
    242 	// With rootless path, we need at least one segment
    243 	if (mode == path_mode::ROOTLESS) {
    244 		for (let i = 0z; true; i += 1) {
    245 			match (strings::next(in)) {
    246 			case let r: rune =>
    247 				if (r == '?' || r == '#') {
    248 					strings::prev(in);
    249 					break;
    250 				};
    251 				if (r == '/') {
    252 					if (i == 0) {
    253 						return invalid;
    254 					} else {
    255 						break;
    256 					};
    257 				};
    258 				if (!is_pchar(r)) {
    259 					return invalid;
    260 				};
    261 			case done =>
    262 				break;
    263 			};
    264 		};
    265 	};
    266 
    267 	for (let r => strings::next(in)) {
    268 		if (r == '?' || r == '#') {
    269 			strings::prev(in);
    270 			break;
    271 		};
    272 		if (!is_pchar(r) && r != '/') {
    273 			return invalid;
    274 		};
    275 	};
    276 
    277 	return percent_decode(strings::slice(&copy, in));
    278 };
    279 
    280 fn parse_query(in: *strings::iterator) (str | invalid) = {
    281 	let copy = *in;
    282 	for (let r => strings::next(in)) {
    283 		if (r == '#') {
    284 			strings::prev(in);
    285 			break;
    286 		};
    287 		if (!is_pchar(r) && r != '/' && r != '?') {
    288 			return invalid;
    289 		};
    290 	};
    291 	return strings::dup(strings::slice(&copy, in));
    292 };
    293 
    294 fn parse_fragment(in: *strings::iterator) (str | invalid) = {
    295 	let copy = *in;
    296 	for (let r => strings::next(in)) {
    297 		if (!is_pchar(r) && r != '/' && r != '?') {
    298 			return invalid;
    299 		};
    300 	};
    301 
    302 	return percent_decode(strings::slice(&copy, in))?;
    303 };
    304 
    305 fn parse_port(in: *strings::iterator) (u16 | invalid) = {
    306 	let copy = *in;
    307 	for (let r => strings::next(in)) {
    308 		if (!ascii::isdigit(r)) {
    309 			strings::prev(in);
    310 			break;
    311 		};
    312 	};
    313 
    314 	match (strconv::stou16(strings::slice(&copy, in))) {
    315 	case let port: u16 =>
    316 		if (port == 0) {
    317 			// There's no port 0
    318 			return invalid;
    319 		};
    320 		return port;
    321 	case =>
    322 		return invalid;
    323 	};
    324 };
    325 
    326 fn percent_decode(s: str) (str | invalid) = {
    327 	let buf = memio::dynamic();
    328 	percent_decode_static(&buf, s)?;
    329 	return memio::string(&buf)!;
    330 };
    331 
    332 fn percent_decode_static(out: io::handle, s: str) (void | invalid) = {
    333 	let iter = strings::iter(s);
    334 	let tmp = memio::dynamic();
    335 	defer io::close(&tmp)!;
    336 	let percent_data: []u8 = [];
    337 	for (true) {
    338 		match (strings::next(&iter)) {
    339 		case let r: rune =>
    340 			if (r == '%') {
    341 				memio::reset(&tmp);
    342 				for (let i = 0z; i < 2; i += 1) {
    343 					const r = wantrune(&iter)?;
    344 					memio::appendrune(&tmp, r)!;
    345 				};
    346 
    347 				match (strconv::stou8(memio::string(&tmp)!,
    348 					strconv::base::HEX)) {
    349 				case let ord: u8 =>
    350 					append(percent_data, ord)!;
    351 				case =>
    352 					return invalid;
    353 				};
    354 			} else {
    355 				if(len(percent_data) > 0) {
    356 					match(strings::fromutf8(percent_data)) {
    357 					case let stro: str =>
    358 						memio::concat(out, stro)!;
    359 					case utf8::invalid =>
    360 						return invalid;
    361 					};
    362 
    363 					free(percent_data);
    364 					percent_data = [];
    365 				};
    366 
    367 				memio::appendrune(out, r)!;
    368 			};
    369 		case done =>
    370 			if(len(percent_data) > 0) {
    371 				match(strings::fromutf8(percent_data)) {
    372 				case let stro: str =>
    373 					memio::concat(out, stro)!;
    374 				case utf8::invalid =>
    375 					return invalid;
    376 				};
    377 
    378 				free(percent_data);
    379 				percent_data = [];
    380 			};
    381 
    382 			break;
    383 		};
    384 	};
    385 };
    386 
    387 fn wantrune(iter: *strings::iterator) (rune | invalid) = {
    388 	match (strings::next(iter)) {
    389 	case let r: rune =>
    390 		return r;
    391 	case =>
    392 		return invalid;
    393 	};
    394 };
    395 
    396 fn free_host(in: (str | ip::addr6)) void = {
    397 	match (in) {
    398 	case let s: str =>
    399 		free(s);
    400 	case => void;
    401 	};
    402 };
    403 
    404 fn is_userinfo(r: rune) bool =
    405 	// unreserved + sub-delim + ":"
    406 	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=:", r)
    407 	// %-encoded
    408 	|| r == '%' || ascii::isxdigit(r);
    409 
    410 fn is_host(r: rune) bool =
    411 	// unreserved + sub-delim
    412 	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=", r)
    413 	// %-encoded
    414 	|| r == '%' || ascii::isxdigit(r);
    415 
    416 fn is_pchar(r: rune) bool =
    417 	// unreserved + sub-delim + ":"/"@"
    418 	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=:@", r)
    419 	// %-encoded
    420 	|| r == '%' || ascii::isxdigit(r);