parse.ha (8361B)
// License: MPL-2.0
// (c) 2022 Alexey Yerin <yyp@disroot.org>
// (c) 2022 Umar Getagazov <umar@handlerug.me>
use ascii;
use encoding::utf8;
use io;
use memio;
use net::ip;
use strconv;
use strings;

// The URI provided to [[parse]] is invalid.
export type invalid = !void;

// Parses a URI string into [[uri]] structure. The return value must be freed
// using [[finish]].
//
// If the input is rejected, every string allocated while parsing is released
// before [[invalid]] is returned.
export fn parse(in: str) (uri | invalid) = {
	// Flipped to true right before the successful return; until then the
	// deferred cleanups below free whatever has been allocated so far.
	let success = false;
	let in = strings::iter(in);

	const scheme = parse_scheme(&in)?;
	defer if (!success) free(scheme);

	// Determine hier-part variant
	let path = "";
	let authority: ((str | ip::addr6), u16, str) = ("", 0u16, "");
	defer if (!success) {
		free(path);
		match (authority.0) {
		case let s: str =>
			free(s);
		case => void;
		};
		free(authority.2);
	};

	match (strings::next(&in)) {
	case let r: rune =>
		switch (r) {
		case '/' =>
			// Either "//"+authority+path-abempty or path-absolute
			match (strings::next(&in)) {
			case let r: rune =>
				switch (r) {
				case '/' =>
					// "//" + authority + path-abempty
					authority = parse_authority(&in)?;
					match (strings::next(&in)) {
					case let r: rune =>
						switch (r) {
						case '?', '#' =>
							// path-empty
							strings::prev(&in);
						case '/' =>
							// path-absolute
							strings::prev(&in);
							path = parse_path(&in,
								path_mode::ABSOLUTE)?;
						case =>
							return invalid;
						};
					case => void; // path-empty
					};
				case =>
					// path-absolute: step back over both
					// the rune just read and the leading
					// '/', so that the slash stays part of
					// the path.
					strings::prev(&in);
					strings::prev(&in);
					path = parse_path(&in,
						path_mode::ABSOLUTE)?;
				};
			case =>
				// path-absolute made of a single "/"
				strings::prev(&in);
				path = parse_path(&in, path_mode::ABSOLUTE)?;
			};
		case =>
			// path-rootless
			strings::prev(&in);
			path = parse_path(&in, path_mode::ROOTLESS)?;
		};
	case => void; // path-empty
	};

	let query = "";
	defer if (!success) free(query);
	match (strings::next(&in)) {
	case let r: rune =>
		if (r == '?') {
			query = parse_query(&in)?;
		} else {
			strings::prev(&in);
		};
	case => void;
	};

	let fragment = "";
	match (strings::next(&in)) {
	case let r: rune =>
		if (r == '#') {
			fragment = parse_fragment(&in)?;
		} else {
			strings::prev(&in);
		};
	case => void;
	};

	// Nothing below can fail; ownership moves to the returned uri.
	success = true;
	return uri {
		scheme = scheme,

		host = match (authority.0) {
		case let v6: ip::addr6 =>
			yield v6;
		case let s: str =>
			yield match (ip::parse(s)) {
			case let a: ip::addr =>
				// The textual form is no longer needed once
				// it has been converted to an address.
				free(s);
				yield a;
			case ip::invalid =>
				yield s;
			};
		},
		port = authority.1,
		userinfo = authority.2,

		path = path,
		query = query,
		fragment = fragment,
	};
};

// Reads the scheme up to and including the ':' that terminates it. The first
// rune must be an ASCII letter; the following ones must be ASCII letters,
// digits, '+', '-' or '.'. Returns a newly allocated copy of the scheme
// (without the ':'), or [[invalid]] if the input ends before a ':' is found
// or contains a disallowed rune.
fn parse_scheme(in: *strings::iterator) (str | invalid) = {
	let copy = *in;
	for (let i = 0z; true; i += 1) {
		const r = wantrune(in)?;
		if (i > 0 && r == ':') {
			// Step back so that the slice below excludes the ':'
			strings::prev(in);
			break;
		};
		if (i == 0) {
			if (!ascii::isalpha(r)) {
				return invalid;
			};
		} else {
			if (!ascii::isalnum(r) && !strings::contains("+-.", r)) {
				return invalid;
			};
		};
	};
	let s = strings::dup(strings::slice(&copy, in));
	// Consume the ':'
	strings::next(in);
	return s;
};

// Reads the authority component that follows "//". Returns a (host, port,
// userinfo) tuple; the port is 0 if none was given. The string members of the
// tuple are newly allocated. The iterator is left at the '/', '?' or '#' that
// ended the authority, or at the end of the input.
//
// NOTE(review): a ':' always ends the host and starts the port, so userinfo
// that itself contains ':' ("user:password@host") appears to be rejected by
// [[parse_port]] - confirm whether that is intended.
fn parse_authority(
	in: *strings::iterator,
) (((str | ip::addr6), u16, str) | invalid) = {
	// Scan everything until '@' or ':' or '/', then decide what it is
	let success = false;
	let buf = memio::dynamic();
	defer io::close(&buf)!;
	let host: (str | ip::addr6) = "";
	let port = 0u16;
	let userinfo = "";
	let has_userinfo = false;
	// Release partial results when bailing out with an error
	defer if (!success) {
		match (host) {
		case let s: str =>
			free(s);
		case => void;
		};
		free(userinfo);
	};

	for (true) {
		const r = match (strings::next(in)) {
		case let r: rune =>
			yield r;
		case void =>
			break;
		};

		if (r == '[') {
			// Bracketed IPv6 literal. Anything collected before the
			// bracket is taken as userinfo, unless userinfo has
			// already been seen.
			if (len(memio::string(&buf)!) > 0) {
				if (len(userinfo) > 0) {
					return invalid;
				} else {
					userinfo = percent_decode(
						memio::string(&buf)!)?;
				};
			};
			memio::reset(&buf);

			for (true) {
				const r = wantrune(in)?;
				if (r == ']') {
					break;
				};
				memio::appendrune(&buf, r)!;
			};

			const addr = percent_decode(memio::string(&buf)!)?;
			// Only the parsed address is kept
			defer free(addr);
			match (ip::parse(addr)) {
			case let v6: ip::addr6 =>
				host = v6;
			case =>
				return invalid;
			};
		} else if (r == ':' || !is_userinfo(r) && !is_host(r)) {
			switch (r) {
			case '@' =>
				if (has_userinfo) {
					return invalid;
				};
				// This was userinfo+host[+port]
				const decoded = percent_decode(
					memio::string(&buf)!)?;
				free(userinfo);
				userinfo = decoded;
				memio::reset(&buf);
				has_userinfo = true;
			case '/', '?', '#' =>
				// This was just host; leave the delimiter for
				// the caller.
				strings::prev(in);
				host = percent_decode(memio::string(&buf)!)?;
				break;
			case ':' =>
				// This was host+port
				host = percent_decode(memio::string(&buf)!)?;
				port = parse_port(in)?;
				break;
			case =>
				return invalid;
			};
		} else {
			memio::appendrune(&buf, r)!;
		};
	};

	match (host) {
	case let s: str =>
		// In end of string case
		if (len(s) == 0) {
			host = percent_decode(memio::string(&buf)!)?;
		};
	case => void;
	};

	success = true;
	return (host, port, userinfo);
};

// Selects the path grammar checked by [[parse_path]].
type path_mode = enum {
	ABSOLUTE,
	ROOTLESS,
};

// Reads a path up to, but not including, the '?' or '#' that follows it, or
// up to the end of the input. In ROOTLESS mode the first segment may not
// begin with '/'. Returns the percent-decoded path as a newly allocated
// string, or [[invalid]] if a rune is not allowed in a path.
fn parse_path(in: *strings::iterator, mode: path_mode) (str | invalid) = {
	let copy = *in;
	// With rootless path, we need at least one segment
	if (mode == path_mode::ROOTLESS) {
		for (let i = 0z; true; i += 1) {
			match (strings::next(in)) {
			case let r: rune =>
				if (r == '?' || r == '#') {
					strings::prev(in);
					break;
				};
				if (r == '/') {
					if (i == 0) {
						return invalid;
					} else {
						break;
					};
				};
				if (!is_pchar(r)) {
					return invalid;
				};
			case void =>
				break;
			};
		};
	};

	// Remaining segments
	for (true) {
		match (strings::next(in)) {
		case let r: rune =>
			if (r == '?' || r == '#') {
				strings::prev(in);
				break;
			};
			if (!is_pchar(r) && r != '/') {
				return invalid;
			};
		case void =>
			break;
		};
	};

	return percent_decode(strings::slice(&copy, in));
};

// Reads a query (the part after '?') up to, but not including, a '#', or up
// to the end of the input. Returns a newly allocated copy of the query text;
// it is not percent-decoded.
fn parse_query(in: *strings::iterator) (str | invalid) = {
	let copy = *in;
	for (true) {
		match (strings::next(in)) {
		case let r: rune =>
			if (r == '#') {
				strings::prev(in);
				break;
			};
			if (!is_pchar(r) && r != '/' && r != '?') {
				return invalid;
			};
		case void =>
			break;
		};
	};
	return strings::dup(strings::slice(&copy, in));
};

// Reads a fragment (the part after '#') up to the end of the input. Returns
// the percent-decoded fragment as a newly allocated string.
fn parse_fragment(in: *strings::iterator) (str | invalid) = {
	let copy = *in;
	for (true) {
		match (strings::next(in)) {
		case let r: rune =>
			if (!is_pchar(r) && r != '/' && r != '?') {
				return invalid;
			};
		case void =>
			break;
		};
	};

	return percent_decode(strings::slice(&copy, in))?;
};

// Reads a run of ASCII digits and converts it to a port number. The iterator
// is left at the first rune that is not a digit. An empty run, a value which
// does not fit in a u16 and port 0 are all rejected.
fn parse_port(in: *strings::iterator) (u16 | invalid) = {
	let copy = *in;
	for (true) {
		const r = match (strings::next(in)) {
		case let r: rune =>
			yield r;
		case void =>
			break;
		};

		if (!ascii::isdigit(r)) {
			strings::prev(in);
			break;
		};
	};

	match (strconv::stou16(strings::slice(&copy, in))) {
	case let port: u16 =>
		if (port == 0) {
			// There's no port 0
			return invalid;
		};
		return port;
	case =>
		return invalid;
	};
};

// Percent-decodes a string. The return value is newly allocated and must be
// freed by the caller.
fn percent_decode(s: str) (str | invalid) = {
	let buf = memio::dynamic();
	match (percent_decode_static(&buf, s)) {
	case void => void;
	case invalid =>
		// Do not leak the partially written buffer
		io::close(&buf)!;
		return invalid;
	};
	return memio::string(&buf)!;
};

// Percent-decodes s and writes the result to out. Consecutive %XX escapes are
// collected as raw bytes and written out together once a literal rune or the
// end of the string is reached; [[invalid]] is returned if those bytes are
// not valid UTF-8, if an escape is cut short, or if it is not valid
// hexadecimal.
fn percent_decode_static(out: io::handle, s: str) (void | invalid) = {
	let iter = strings::iter(s);
	let tmp = memio::dynamic();
	defer io::close(&tmp)!;
	// Bytes of the current run of escapes; heap-allocated by append
	let percent_data: []u8 = [];
	defer free(percent_data);
	for (true) {
		match (strings::next(&iter)) {
		case let r: rune =>
			if (r == '%') {
				memio::reset(&tmp);
				for (let i = 0z; i < 2; i += 1) {
					const r = wantrune(&iter)?;
					memio::appendrune(&tmp, r)!;
				};

				match (strconv::stou8b(memio::string(&tmp)!,
					strconv::base::HEX)) {
				case let ord: u8 =>
					append(percent_data, ord);
				case =>
					return invalid;
				};
			} else {
				if (len(percent_data) > 0) {
					match (strings::fromutf8(percent_data)) {
					case let stro: str =>
						memio::concat(out, stro)!;
					case utf8::invalid =>
						return invalid;
					};

					free(percent_data);
					percent_data = [];
				};

				memio::appendrune(out, r)!;
			};
		case void =>
			if (len(percent_data) > 0) {
				match (strings::fromutf8(percent_data)) {
				case let stro: str =>
					memio::concat(out, stro)!;
				case utf8::invalid =>
					return invalid;
				};

				free(percent_data);
				percent_data = [];
			};

			break;
		};
	};
};

// Returns the next rune from the iterator, or [[invalid]] if the end of the
// string has been reached.
fn wantrune(iter: *strings::iterator) (rune | invalid) = {
	match (strings::next(iter)) {
	case let r: rune =>
		return r;
	case =>
		return invalid;
	};
};

// Reports whether r may appear in the userinfo component.
fn is_userinfo(r: rune) bool =
	// unreserved + sub-delim + ":"
	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=:", r)
	// %-encoded (the hex digits are covered by isalnum)
	|| r == '%';

// Reports whether r may appear in a registered-name host.
fn is_host(r: rune) bool =
	// unreserved + sub-delim
	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=", r)
	// %-encoded (the hex digits are covered by isalnum)
	|| r == '%';

// Reports whether r may appear in a path segment (pchar).
fn is_pchar(r: rune) bool =
	// unreserved + sub-delim + ":"/"@"
	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=:@", r)
	// %-encoded (the hex digits are covered by isalnum)
	|| r == '%';