parse.ha (3726B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use ascii;
use errors;
use strings;

// Characters which may not appear in the type or subtype of a Media Type (the
// "tspecials" production of RFC 2045 section 5.1, which includes the double
// quote). Consulted by [[typevalid]].
const tspecial: str = "()<>@,;:\\/[]?=\"";

// Iteration state for the parameter list of a Media Type. Obtain one from
// [[parse]] and pass it to [[next_param]].
export type type_params = strings::tokenizer;

// Parses a Media Type, returning a tuple of the content type (e.g.
// "text/plain") and a parameter parser object, or [[errors::invalid]] if the
// input cannot be parsed.
//
// To enumerate the Media Type parameter list, pass the type_params object into
// [[next_param]]. If you do not need the parameter list, you can safely discard
// the object. Note that any format errors following the ";" token will not
// cause [[errors::invalid]] to be returned unless [[next_param]] is used to
// enumerate all of the parameters.
export fn parse(in: str) ((str, type_params) | errors::invalid) = {
	// Everything before the first ";" is the type; the rest is parameters.
	const items = strings::cut(in, ";");
	const mtype = items.0, params = items.1;

	// Both halves of "type/subtype" must be present and well-formed.
	const items = strings::cut(mtype, "/");
	if (len(items.0) < 1 || len(items.1) < 1) {
		return errors::invalid;
	};
	typevalid(items.0)?;
	typevalid(items.1)?;

	// Parameters are split lazily; they are validated by next_param.
	return (mtype, strings::tokenize(params, ";"));
};

// Returns the next parameter as a (key, value) tuple from a [[type_params]]
// object that was prepared via [[parse]], void if there are no remaining
// parameters, and [[errors::invalid]] if a syntax error was encountered.
//
// Whitespace surrounding the key and the value is removed. If the value starts
// with a double quote, the surrounding quotes are removed as well.
export fn next_param(in: *type_params) ((str, str) | void | errors::invalid) = {
	const tok = match (strings::next_token(in: *strings::tokenizer)) {
	case let s: str =>
		if (s == "") {
			// empty parameter
			return errors::invalid;
		};
		yield s;
	case done =>
		return;
	};

	const items = strings::cut(tok, "=");
	// The RFC does not permit whitespace here, but whitespace is very
	// common in the wild. ¯\_(ツ)_/¯
	items.0 = strings::trim(items.0);
	items.1 = strings::trim(items.1);
	// A parameter needs both a key and a value ("charset" alone is invalid).
	if (len(items.0) == 0 || len(items.1) == 0) {
		return errors::invalid;
	};

	if (strings::hasprefix(items.1, "\"")) {
		items.1 = quoted(items.1)?;
	};

	return (items.0, items.1);
};

// Strips the double quotes from a quoted-string parameter value, returning the
// contents, or [[errors::invalid]] if the contents include a backslash, a
// carriage return or a line feed. The result is derived from the input with
// [[strings::trim]]; no new string is allocated here.
//
// Limitation: quotes are trimmed from both ends without checking that a
// closing quote is actually present.
fn quoted(in: str) (str | errors::invalid) = {
	// We have only a basic implementation of quoted-string. It has a couple
	// of problems:
	//
	// 1. The RFC does not define it very well
	// 2. The parts of the RFC which are ill-defined are rarely used
	// 3. Implementing quoted-pair would require allocating a new string
	//
	// This implementation should handle most Media Types seen in practice
	// unless they're doing something weird and ill-advised with them.
	in = strings::trim(in, '"');
	if (strings::contains(in, "\\")
			|| strings::contains(in, "\r")
			|| strings::contains(in, "\n")) {
		return errors::invalid;
	};
	return in;
};

// Checks one half of "type/subtype". Returns [[errors::invalid]] if any rune
// is not ASCII, is a space, is a control character, or is listed in
// [[tspecial]]; returns void otherwise.
fn typevalid(in: str) (void | errors::invalid) = {
	const miter = strings::iter(in);
	for (let rn => strings::next(&miter)) {
		if (!ascii::valid(rn) || rn == ' '
				|| ascii::iscntrl(rn)
				|| strings::contains(tspecial, rn)) {
			return errors::invalid;
		};
	};
};

@test fn parse() void = {
	const res = parse("text/plain")!;
	assert(res.0 == "text/plain");

	const res = parse("image/png")!;
	assert(res.0 == "image/png");

	const res = parse("application/svg+xml; charset=utf-8; foo=\"bar baz\"")!;
	assert(res.0 == "application/svg+xml");
	const params = res.1;
	const param = next_param(&params)! as (str, str);
	assert(param.0 == "charset" && param.1 == "utf-8");
	const param = next_param(&params)! as (str, str);
	assert(param.0 == "foo" && param.1 == "bar baz");
	assert(next_param(&params) is void);

	assert(parse("hi") is errors::invalid);
	assert(parse("text/ spaces ") is errors::invalid);
	assert(parse("text/@") is errors::invalid);
	// The double quote is a tspecial and may not appear in a type name.
	assert(parse("text/\"plain\"") is errors::invalid);

	const res = parse("text/plain;charset")!;
	assert(res.0 == "text/plain");
	assert(next_param(&res.1) is errors::invalid);
};