hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

parse.ha (8361B)


      1 // License: MPL-2.0
      2 // (c) 2022 Alexey Yerin <yyp@disroot.org>
      3 // (c) 2022 Umar Getagazov <umar@handlerug.me>
      4 use ascii;
      5 use encoding::utf8;
      6 use io;
      7 use memio;
      8 use net::ip;
      9 use strconv;
     10 use strings;
     11 
     12 // Error returned by [[parse]] when the input string is not a well-formed URI.
     13 export type invalid = !void;
     14 
     15 // Parses a URI string into [[uri]] structure. The return value must be freed
     16 // using [[finish]].
     17 export fn parse(in: str) (uri | invalid) = {
     18 	let in = strings::iter(in);
     19 
     20 	const scheme = parse_scheme(&in)?;
     21 
     22 	// Determine hier-part variant
     23 	let path = "";
     24 	let authority: ((str | ip::addr6), u16, str) = ("", 0u16, "");
     25 	match (strings::next(&in)) {
     26 	case let r: rune =>
     27 		switch (r) {
     28 		case '/' =>
     29 			// Either "//"+authority+path-abempty or path-absolute
     30 			switch (wantrune(&in)?) {
     31 			case '/' =>
     32 				// "//" + authority + path-abempty
     33 				authority = parse_authority(&in)?;
     34 				match (strings::next(&in)) {
     35 				case let r: rune =>
     36 					switch (r) {
     37 					case '?', '#' =>
     38 						// path-empty
     39 						strings::prev(&in);
     40 					case '/' =>
     41 						// path-absolute
     42 						strings::prev(&in);
     43 						path = parse_path(&in,
     44 							path_mode::ABSOLUTE)?;
     45 					case =>
     46 						return invalid;
     47 					};
     48 				case => void; // path-empty
     49 				};
     50 			case =>
     51 				// path-absolute
     52 				strings::prev(&in);
     53 				path = parse_path(&in, path_mode::ABSOLUTE)?;
     54 			};
     55 		case =>
     56 			// path-rootless
     57 			strings::prev(&in);
     58 			path = parse_path(&in, path_mode::ROOTLESS)?;
     59 		};
     60 	case => void; // path-empty
     61 	};
     62 
     63 	let query = "";
     64 	match (strings::next(&in)) {
     65 	case let r: rune =>
     66 		if (r == '?') {
     67 			query = parse_query(&in)?;
     68 		} else {
     69 			strings::prev(&in);
     70 		};
     71 	case => void;
     72 	};
     73 
     74 	let fragment = "";
     75 	match (strings::next(&in)) {
     76 	case let r: rune =>
     77 		if (r == '#') {
     78 			fragment = parse_fragment(&in)?;
     79 		} else {
     80 			strings::prev(&in);
     81 		};
     82 	case => void;
     83 	};
     84 
     85 	return uri {
     86 		scheme = scheme,
     87 
     88 		host = match (authority.0) {
     89 		case let ip: ip::addr6 =>
     90 			yield ip;
     91 		case let s: str =>
     92 			yield match (ip::parse(s)) {
     93 			case let a: ip::addr =>
     94 				yield a;
     95 			case ip::invalid =>
     96 				yield s;
     97 			};
     98 		},
     99 		port = authority.1,
    100 		userinfo = authority.2,
    101 
    102 		path = path,
    103 		query = query,
    104 		fragment = fragment,
    105 	};
    106 };
    107 
    108 fn parse_scheme(in: *strings::iterator) (str | invalid) = {
    109 	let copy = *in;
    110 	for (let i = 0z; true; i += 1) {
    111 		const r = wantrune(in)?;
    112 		if (i > 0 && r == ':') {
    113 			strings::prev(in);
    114 			break;
    115 		};
    116 		if (i == 0) {
    117 			if (!ascii::isalpha(r)) {
    118 				return invalid;
    119 			};
    120 		} else {
    121 			if (!ascii::isalnum(r) && !strings::contains("+-.", r)) {
    122 				return invalid;
    123 			};
    124 		};
    125 	};
    126 	let s = strings::dup(strings::slice(&copy, in));
    127 	strings::next(in);
    128 	return s;
    129 };
    130 
    131 fn parse_authority(
    132 	in: *strings::iterator,
    133 ) (((str | ip::addr6), u16, str) | invalid) = {
    134 	// Scan everything until '@' or ':' or '/', then decide what it is
    135 	let buf = memio::dynamic();
    136 	defer io::close(&buf)!;
    137 	let host: (str | ip::addr6) = "";
    138 	let port = 0u16;
    139 	let userinfo = "";
    140 	let has_userinfo = false;
    141 
    142 	for (true) {
    143 		const r = match (strings::next(in)) {
    144 		case let r: rune =>
    145 			yield r;
    146 		case void =>
    147 			break;
    148 		};
    149 
    150 		if (r == '[') {
    151 			if (len(memio::string(&buf)!) > 0) {
    152 				if (len(userinfo) > 0) {
    153 					return invalid;
    154 				} else {
    155 					userinfo = percent_decode(
    156 						memio::string(&buf)!)?;
    157 				};
    158 			};
    159 			memio::reset(&buf);
    160 
    161 			for (true) {
    162 				const r = wantrune(in)?;
    163 				if (r == ']') {
    164 					break;
    165 				};
    166 				memio::appendrune(&buf, r)!;
    167 			};
    168 
    169 			const addr = percent_decode(memio::string(&buf)!)?;
    170 			match (ip::parse(addr)) {
    171 			case let v6: ip::addr6 =>
    172 				host = v6;
    173 			case =>
    174 				return invalid;
    175 			};
    176 		} else if (r == ':' || !is_userinfo(r) && !is_host(r)) {
    177 			switch (r) {
    178 			case '@' =>
    179 				if (has_userinfo) {
    180 					return invalid;
    181 				};
    182 				// This was userinfo+host[+port]
    183 				userinfo = percent_decode(memio::string(&buf)!)?;
    184 				memio::reset(&buf);
    185 				has_userinfo = true;
    186 			case '/' =>
    187 				// This was just host
    188 				strings::prev(in);
    189 				host = percent_decode(memio::string(&buf)!)?;
    190 				break;
    191 			case ':' =>
    192 				// This was host+port
    193 				host = percent_decode(memio::string(&buf)!)?;
    194 				port = parse_port(in)?;
    195 				break;
    196 			case =>
    197 				return invalid;
    198 			};
    199 		} else {
    200 			memio::appendrune(&buf, r)!;
    201 		};
    202 	};
    203 
    204 	match (host) {
    205 	case let s: str =>
    206 		// In end of string case
    207 		if (len(s) == 0) {
    208 			host = percent_decode(memio::string(&buf)!)?;
    209 		};
    210 	case => yield;
    211 	};
    212 
    213 	return (host, port, userinfo);
    214 };
    215 
    216 type path_mode = enum { // tells parse_path how to treat the start of the path
    217 	ABSOLUTE, // used by parse for paths starting with "/"; no first-segment check
    218 	ROOTLESS, // used by parse otherwise; parse_path rejects a leading "/"
    219 };
    220 
    221 fn parse_path(in: *strings::iterator, mode: path_mode) (str | invalid) = {
    222 	let copy = *in;
    223 	// With rootless path, we need at least one segment
    224 	if (mode == path_mode::ROOTLESS) {
    225 		for (let i = 0z; true; i += 1) {
    226 			match (strings::next(in)) {
    227 			case let r: rune =>
    228 				if (r == '?' || r == '#') {
    229 					strings::prev(in);
    230 					break;
    231 				};
    232 				if (r == '/') {
    233 					if (i == 0) {
    234 						return invalid;
    235 					} else {
    236 						break;
    237 					};
    238 				};
    239 				if (!is_pchar(r)) {
    240 					return invalid;
    241 				};
    242 			case void =>
    243 				break;
    244 			};
    245 		};
    246 	};
    247 
    248 	for (true) {
    249 		match (strings::next(in)) {
    250 		case let r: rune =>
    251 			if (r == '?' || r == '#') {
    252 				strings::prev(in);
    253 				break;
    254 			};
    255 			if (!is_pchar(r) && r != '/') {
    256 				return invalid;
    257 			};
    258 		case void =>
    259 			break;
    260 		};
    261 	};
    262 
    263 	return percent_decode(strings::slice(&copy, in));
    264 };
    265 
    266 fn parse_query(in: *strings::iterator) (str | invalid) = {
    267 	let copy = *in;
    268 	for (true) {
    269 		match (strings::next(in)) {
    270 		case let r: rune =>
    271 			if (r == '#') {
    272 				strings::prev(in);
    273 				break;
    274 			};
    275 			if (!is_pchar(r) && r != '/' && r != '?') {
    276 				return invalid;
    277 			};
    278 		case void =>
    279 			break;
    280 		};
    281 	};
    282 	return strings::dup(strings::slice(&copy, in));
    283 };
    284 
    285 fn parse_fragment(in: *strings::iterator) (str | invalid) = {
    286 	let copy = *in;
    287 	for (true) {
    288 		match (strings::next(in)) {
    289 		case let r: rune =>
    290 			if (!is_pchar(r) && r != '/' && r != '?') {
    291 				return invalid;
    292 			};
    293 		case void =>
    294 			break;
    295 		};
    296 	};
    297 
    298 	return percent_decode(strings::slice(&copy, in))?;
    299 };
    300 
    301 fn parse_port(in: *strings::iterator) (u16 | invalid) = {
    302 	let copy = *in;
    303 	for (true) {
    304 		const r = match (strings::next(in)) {
    305 		case let r: rune =>
    306 			yield r;
    307 		case void =>
    308 			break;
    309 		};
    310 
    311 		if (!ascii::isdigit(r)) {
    312 			strings::prev(in);
    313 			break;
    314 		};
    315 	};
    316 
    317 	match (strconv::stou16(strings::slice(&copy, in))) {
    318 	case let port: u16 =>
    319 		if (port == 0) {
    320 			// There's no port 0
    321 			return invalid;
    322 		};
    323 		return port;
    324 	case =>
    325 		return invalid;
    326 	};
    327 };
    328 
    329 fn percent_decode(s: str) (str | invalid) = {
    330 	let buf = memio::dynamic();
    331 	percent_decode_static(&buf, s)?;
    332 	return memio::string(&buf)!;
    333 };
    334 
    335 fn percent_decode_static(out: io::handle, s: str) (void | invalid) = {
    336 	let iter = strings::iter(s);
    337 	let tmp = memio::dynamic();
    338 	defer io::close(&tmp)!;
    339 	let percent_data: []u8 = [];
    340 	for (true) {
    341 		match (strings::next(&iter)) {
    342 		case let r: rune =>
    343 			if (r == '%') {
    344 				memio::reset(&tmp);
    345 				for (let i = 0z; i < 2; i += 1) {
    346 					const r = wantrune(&iter)?;
    347 					memio::appendrune(&tmp, r)!;
    348 				};
    349 
    350 				match (strconv::stou8b(memio::string(&tmp)!,
    351 					strconv::base::HEX)) {
    352 				case let ord: u8 =>
    353 					append(percent_data, ord);
    354 				case =>
    355 					return invalid;
    356 				};
    357 			} else {
    358 				if(len(percent_data) > 0) {
    359 					match(strings::fromutf8(percent_data)) {
    360 					case let stro: str =>
    361 						memio::concat(out, stro)!;
    362 					case utf8::invalid =>
    363 						return invalid;
    364 					};
    365 
    366 					percent_data = [];
    367 				};
    368 
    369 				memio::appendrune(out, r)!;
    370 			};
    371 		case void =>
    372 			if(len(percent_data) > 0) {
    373 				match(strings::fromutf8(percent_data)) {
    374 				case let stro: str =>
    375 					memio::concat(out, stro)!;
    376 				case utf8::invalid =>
    377 					return invalid;
    378 				};
    379 
    380 				percent_data = [];
    381 			};
    382 
    383 			break;
    384 		};
    385 	};
    386 };
    387 
    388 fn wantrune(iter: *strings::iterator) (rune | invalid) = {
    389 	match (strings::next(iter)) {
    390 	case let r: rune =>
    391 		return r;
    392 	case =>
    393 		return invalid;
    394 	};
    395 };
    396 
    397 fn is_userinfo(r: rune) bool =
    398 	// unreserved + sub-delim + ":"
    399 	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=:", r)
    400 	// %-encoded
    401 	|| r == '%' || ascii::isxdigit(r);
    402 
    403 fn is_host(r: rune) bool =
    404 	// unreserved + sub-delim
    405 	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=", r)
    406 	// %-encoded
    407 	|| r == '%' || ascii::isxdigit(r);
    408 
    409 fn is_pchar(r: rune) bool =
    410 	// unreserved + sub-delim + ":"/"@"
    411 	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=:@", r)
    412 	// %-encoded
    413 	|| r == '%' || ascii::isxdigit(r);