parse.ha (7966B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use ascii;
use encoding::utf8;
use io;
use memio;
use net::ip;
use strconv;
use strings;

// The URI provided to [[parse]] is invalid.
export type invalid = !void;

// Parses a URI string into [[uri]] structure. The return value must be freed
// using [[finish]].
//
// If [[invalid]] is returned, every string allocated while parsing the earlier
// components has already been freed; the caller has nothing to clean up.
export fn parse(in: str) (uri | invalid) = {
	let in = strings::iter(in);

	// Set just before the successful return. Until then, each deferred
	// cleanup below releases the component it guards, so that an early
	// "?" return does not leak the components parsed so far.
	let success = false;

	const scheme = parse_scheme(&in)?;
	defer if (!success) free(scheme);

	// Determine hier-part variant
	let path = "";
	defer if (!success) free(path);
	let authority: ((str | ip::addr6), u16, str) = ("", 0u16, "");
	defer if (!success) {
		match (authority.0) {
		case let s: str =>
			free(s);
		case ip::addr6 => void;
		};
		free(authority.2);
	};

	match (strings::next(&in)) {
	case let r: rune =>
		switch (r) {
		case '/' =>
			// Either "//"+authority+path-abempty or path-absolute
			switch (wantrune(&in)?) {
			case '/' =>
				// "//" + authority + path-abempty
				authority = parse_authority(&in)?;
				match (strings::next(&in)) {
				case let r: rune =>
					switch (r) {
					case '?', '#' =>
						// path-empty
						strings::prev(&in);
					case '/' =>
						// path-absolute
						strings::prev(&in);
						path = parse_path(&in,
							path_mode::ABSOLUTE)?;
					case =>
						return invalid;
					};
				case => void; // path-empty
				};
			case =>
				// path-absolute
				// NOTE(review): two runes have been consumed
				// at this point ('/' and its successor) but
				// only one is put back, so the leading '/'
				// does not appear to end up in the path, and
				// "scheme:/" alone is rejected by wantrune.
				// Behaviour kept as-is; confirm against
				// RFC 3986 path-absolute before changing.
				strings::prev(&in);
				path = parse_path(&in, path_mode::ABSOLUTE)?;
			};
		case =>
			// path-rootless
			strings::prev(&in);
			path = parse_path(&in, path_mode::ROOTLESS)?;
		};
	case => void; // path-empty
	};

	let query = "";
	defer if (!success) free(query);
	match (strings::next(&in)) {
	case let r: rune =>
		if (r == '?') {
			query = parse_query(&in)?;
		} else {
			strings::prev(&in);
		};
	case => void;
	};

	let fragment = "";
	match (strings::next(&in)) {
	case let r: rune =>
		if (r == '#') {
			fragment = parse_fragment(&in)?;
		} else {
			strings::prev(&in);
		};
	case => void;
	};

	// Nothing below can fail: ownership of every component moves into the
	// returned structure, so the deferred cleanups must not run.
	success = true;

	return uri {
		scheme = scheme,

		host = match (authority.0) {
		case let v6: ip::addr6 =>
			yield v6;
		case let s: str =>
			yield match (ip::parse(s)) {
			case let a: ip::addr =>
				// The textual form is not stored in the
				// result, so release it here.
				free(s);
				yield a;
			case ip::invalid =>
				yield s;
			};
		},
		port = authority.1,
		userinfo = authority.2,

		path = path,
		query = query,
		fragment = fragment,
	};
};

// Parses the scheme (a letter followed by letters, digits, '+', '-' or '.')
// and consumes the ':' that terminates it. The returned string is a
// heap-allocated copy which the caller must free.
fn parse_scheme(in: *strings::iterator) (str | invalid) = {
	let start = *in;
	for (let i = 0z; true; i += 1) {
		const r = wantrune(in)?;
		if (i > 0 && r == ':') {
			// Step back so the slice below excludes the ':'
			strings::prev(in);
			break;
		};
		if (i == 0) {
			if (!ascii::isalpha(r)) {
				return invalid;
			};
		} else {
			if (!ascii::isalnum(r) && !strings::contains("+-.", r)) {
				return invalid;
			};
		};
	};
	let s = strings::dup(strings::slice(&start, in));
	// Skip over the ':'
	strings::next(in);
	return s;
};

// Parses the authority component which follows "//". Returns a tuple of
// (host, port, userinfo); the port is 0 if none was given. A terminating '/'
// is left in the iterator for the caller. Strings in the returned tuple are
// heap-allocated and owned by the caller; on failure they are freed here.
fn parse_authority(
	in: *strings::iterator,
) (((str | ip::addr6), u16, str) | invalid) = {
	// Scan everything until '@' or ':' or '/', then decide what it is
	let buf = memio::dynamic();
	defer io::close(&buf)!;
	let host: (str | ip::addr6) = "";
	let port = 0u16;
	let userinfo = "";
	let has_userinfo = false;

	// Release partially parsed components on any failure path.
	let success = false;
	defer if (!success) {
		match (host) {
		case let s: str =>
			free(s);
		case ip::addr6 => void;
		};
		free(userinfo);
	};

	for (let r => strings::next(in)) {
		if (r == '[') {
			// Bracketed IPv6 literal. Anything accumulated before
			// the bracket is treated as userinfo.
			if (len(memio::string(&buf)!) > 0) {
				if (len(userinfo) > 0) {
					return invalid;
				} else {
					userinfo = percent_decode(
						memio::string(&buf)!)?;
				};
			};
			memio::reset(&buf);

			for (true) {
				const r = wantrune(in)?;
				if (r == ']') {
					break;
				};
				memio::appendrune(&buf, r)!;
			};

			// Only the parsed address is kept; the decoded text is
			// temporary.
			const addr = percent_decode(memio::string(&buf)!)?;
			defer free(addr);
			match (ip::parse(addr)) {
			case let v6: ip::addr6 =>
				host = v6;
			case =>
				return invalid;
			};
		} else if (r == ':' || !is_userinfo(r) && !is_host(r)) {
			switch (r) {
			case '@' =>
				if (has_userinfo) {
					return invalid;
				};
				// This was userinfo+host[+port]
				userinfo = percent_decode(memio::string(&buf)!)?;
				memio::reset(&buf);
				has_userinfo = true;
			case '/' =>
				// This was just host
				strings::prev(in);
				host = percent_decode(memio::string(&buf)!)?;
				break;
			case ':' =>
				// This was host+port
				host = percent_decode(memio::string(&buf)!)?;
				port = parse_port(in)?;
				break;
			case =>
				return invalid;
			};
		} else {
			memio::appendrune(&buf, r)!;
		};
	};

	match (host) {
	case let s: str =>
		// In end of string case
		if (len(s) == 0) {
			host = percent_decode(memio::string(&buf)!)?;
		};
	case => void;
	};

	success = true;
	return (host, port, userinfo);
};

// Selects which grammar [[parse_path]] applies to the first path segment.
type path_mode = enum {
	ABSOLUTE,
	ROOTLESS,
};

// Parses a path, stopping before '?' or '#' (which is left in the iterator)
// or at the end of input. Returns the percent-decoded path, which the caller
// must free.
fn parse_path(in: *strings::iterator, mode: path_mode) (str | invalid) = {
	let start = *in;
	// With rootless path, we need at least one segment
	if (mode == path_mode::ROOTLESS) {
		for (let i = 0z; true; i += 1) {
			match (strings::next(in)) {
			case let r: rune =>
				if (r == '?' || r == '#') {
					strings::prev(in);
					break;
				};
				if (r == '/') {
					if (i == 0) {
						return invalid;
					} else {
						break;
					};
				};
				if (!is_pchar(r)) {
					return invalid;
				};
			case done =>
				break;
			};
		};
	};

	for (let r => strings::next(in)) {
		if (r == '?' || r == '#') {
			strings::prev(in);
			break;
		};
		if (!is_pchar(r) && r != '/') {
			return invalid;
		};
	};

	return percent_decode(strings::slice(&start, in));
};

// Parses the query which follows '?', stopping before '#' (which is left in
// the iterator) or at the end of input. The query is returned as a
// heap-allocated copy without percent-decoding; the caller must free it.
fn parse_query(in: *strings::iterator) (str | invalid) = {
	let start = *in;
	for (let r => strings::next(in)) {
		if (r == '#') {
			strings::prev(in);
			break;
		};
		if (!is_pchar(r) && r != '/' && r != '?') {
			return invalid;
		};
	};
	return strings::dup(strings::slice(&start, in));
};

// Parses the fragment which follows '#', consuming the rest of the input.
// Returns the percent-decoded fragment, which the caller must free.
fn parse_fragment(in: *strings::iterator) (str | invalid) = {
	let start = *in;
	for (let r => strings::next(in)) {
		if (!is_pchar(r) && r != '/' && r != '?') {
			return invalid;
		};
	};

	return percent_decode(strings::slice(&start, in))?;
};

// Parses a decimal port number, stopping before the first non-digit (which
// is left in the iterator). Port 0, an empty port and values which do not fit
// in a u16 are rejected.
fn parse_port(in: *strings::iterator) (u16 | invalid) = {
	let start = *in;
	for (let r => strings::next(in)) {
		if (!ascii::isdigit(r)) {
			strings::prev(in);
			break;
		};
	};

	match (strconv::stou16(strings::slice(&start, in))) {
	case let port: u16 =>
		if (port == 0) {
			// There's no port 0
			return invalid;
		};
		return port;
	case =>
		return invalid;
	};
};

// Percent-decodes a string. The result is backed by a freshly allocated
// buffer whose ownership passes to the caller, who must free it. On failure
// the buffer is released here.
fn percent_decode(s: str) (str | invalid) = {
	let buf = memio::dynamic();
	match (percent_decode_static(&buf, s)) {
	case void =>
		return memio::string(&buf)!;
	case invalid =>
		io::close(&buf)!;
		return invalid;
	};
};

// Percent-decodes a string, writing the result to "out". Consecutive
// "%XX" escapes are collected as raw bytes and must together form valid
// UTF-8; otherwise [[invalid]] is returned. Data may already have been
// written to "out" when an error is returned.
fn percent_decode_static(out: io::handle, s: str) (void | invalid) = {
	let iter = strings::iter(s);
	let tmp = memio::dynamic();
	defer io::close(&tmp)!;
	// Bytes of the current run of "%XX" escapes
	let percent_data: []u8 = [];
	defer free(percent_data);
	for (true) {
		match (strings::next(&iter)) {
		case let r: rune =>
			if (r == '%') {
				// Exactly two hex digits must follow
				memio::reset(&tmp);
				for (let i = 0z; i < 2; i += 1) {
					const r = wantrune(&iter)?;
					memio::appendrune(&tmp, r)!;
				};

				match (strconv::stou8(memio::string(&tmp)!,
					strconv::base::HEX)) {
				case let ord: u8 =>
					append(percent_data, ord);
				case =>
					return invalid;
				};
			} else {
				// A literal rune ends the current escape run
				if (len(percent_data) > 0) {
					flush_percent_data(out, percent_data)?;
					free(percent_data);
					percent_data = [];
				};

				memio::appendrune(out, r)!;
			};
		case done =>
			flush_percent_data(out, percent_data)?;
			break;
		};
	};
};

// Validates a run of decoded escape bytes as UTF-8 and writes it to "out".
// Does nothing if the run is empty.
fn flush_percent_data(out: io::handle, data: []u8) (void | invalid) = {
	if (len(data) == 0) {
		return;
	};
	match (strings::fromutf8(data)) {
	case let decoded: str =>
		memio::concat(out, decoded)!;
	case utf8::invalid =>
		return invalid;
	};
};

// Returns the next rune from the iterator, or [[invalid]] if the input is
// exhausted.
fn wantrune(iter: *strings::iterator) (rune | invalid) = {
	match (strings::next(iter)) {
	case let r: rune =>
		return r;
	case =>
		return invalid;
	};
};

// Reports whether a rune may appear in the userinfo component.
fn is_userinfo(r: rune) bool =
	// unreserved + sub-delim + ":"
	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=:", r)
	// %-encoded
	|| r == '%' || ascii::isxdigit(r);

// Reports whether a rune may appear in a registered-name host.
fn is_host(r: rune) bool =
	// unreserved + sub-delim
	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=", r)
	// %-encoded
	|| r == '%' || ascii::isxdigit(r);

// Reports whether a rune may appear in a path segment (pchar).
fn is_pchar(r: rune) bool =
	// unreserved + sub-delim + ":"/"@"
	ascii::isalnum(r) || strings::contains("-._~!$&'()*+,;=:@", r)
	// %-encoded
	|| r == '%' || ascii::isxdigit(r);