parse.ha (3739B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use ascii;
use strings;

// The "tspecials" set from RFC 2045 section 5.1, including the double quote.
// [[typevalid]] rejects any type or subtype token containing one of these
// runes.
const tspecial: str = "()<>@,;:\\\"/[]?=";

// Iterator state for the parameter list of a Media Type. Obtain one from
// [[parse]] and pass it to [[next_param]].
export type type_params = strings::tokenizer;

// A syntax error.
export type syntax = !void;

// Converts an error into a human-friendly string.
export fn strerror(err: syntax) str = "Can't parse Media Type";

// Parses a Media Type, returning a tuple of the content type (e.g.
// "text/plain") and a parameter parser object, or [[syntax]] if the input
// cannot be parsed.
//
// To enumerate the Media Type parameter list, pass the type_params object into
// [[next_param]]. If you do not need the parameter list, you can safely discard
// the object. Note that any format errors following the ";" token will not
// cause [[syntax]] to be returned unless [[next_param]] is used to enumerate
// all of the parameters.
export fn parse(in: str) ((str, type_params) | syntax) = {
	// Everything before the first ";" is the type/subtype pair; everything
	// after it is the (possibly empty) parameter list.
	const halves = strings::cut(in, ";");
	const mtype = halves.0, params = halves.1;

	// Both the type and the subtype must be present and must be valid
	// tokens. A second "/" ends up in the subtype and is rejected there.
	const names = strings::cut(mtype, "/");
	if (len(names.0) < 1 || len(names.1) < 1) {
		return syntax;
	};
	typevalid(names.0)?;
	typevalid(names.1)?;
	return (mtype, strings::tokenize(params, ";"));
};

// Returns the next parameter as a (key, value) tuple from a [[type_params]]
// object that was prepared via [[parse]], done if there are no remaining
// parameters, and [[syntax]] if a syntax error was encountered.
export fn next_param(in: *type_params) ((str, str) | done | syntax) = {
	const tok = match (strings::next_token(in: *strings::tokenizer)) {
	case let s: str =>
		if (s == "") {
			// empty parameter
			return syntax;
		};
		yield s;
	case done =>
		return done;
	};

	let items = strings::cut(tok, "=");
	// The RFC does not permit whitespace here, but whitespace is very
	// common in the wild. ¯\_(ツ)_/¯
	items.0 = strings::trim(items.0);
	items.1 = strings::trim(items.1);
	if (len(items.0) == 0 || len(items.1) == 0) {
		// Missing "=", missing key, or missing value.
		return syntax;
	};

	if (strings::hasprefix(items.1, "\"")) {
		items.1 = quoted(items.1)?;
	};

	return (items.0, items.1);
};

// Strips the surrounding double quotes from a quoted-string parameter value,
// returning [[syntax]] if the remainder contains a backslash, CR, or LF.
fn quoted(in: str) (str | syntax) = {
	// We have only a basic implementation of quoted-string. It has a couple
	// of problems:
	//
	// 1. The RFC does not define it very well
	// 2. The parts of the RFC which are ill-defined are rarely used
	// 3. Implementing quoted-pair would require allocating a new string
	//
	// This implementation should handle most Media Types seen in practice
	// unless they're doing something weird and ill-advised with them.
	//
	// Known limitation: every leading and trailing '"' is trimmed, and the
	// presence of a closing quote is not verified.
	const inner = strings::trim(in, '"');
	if (strings::contains(inner, "\\")
			|| strings::contains(inner, "\r")
			|| strings::contains(inner, "\n")) {
		return syntax;
	};
	return inner;
};

// Checks that a type or subtype name is a valid token: every rune must be
// ASCII and must not be a space, a control character, or one of the
// [[tspecial]] runes. Returns [[syntax]] on the first offending rune.
fn typevalid(in: str) (void | syntax) = {
	let miter = strings::iter(in);
	for (let rn => strings::next(&miter)) {
		if (!ascii::valid(rn) || rn == ' '
				|| ascii::iscntrl(rn)
				|| strings::contains(tspecial, rn)) {
			return syntax;
		};
	};
};

@test fn parse() void = {
	const res = parse("text/plain")!;
	assert(res.0 == "text/plain");

	const res = parse("image/png")!;
	assert(res.0 == "image/png");

	const res = parse("application/svg+xml; charset=utf-8; foo=\"bar baz\"")!;
	assert(res.0 == "application/svg+xml");
	let params = res.1;
	const param = next_param(&params)! as (str, str);
	assert(param.0 == "charset" && param.1 == "utf-8");
	const param = next_param(&params)! as (str, str);
	assert(param.0 == "foo" && param.1 == "bar baz");
	assert(next_param(&params) is done);

	assert(parse("hi") is syntax);
	assert(parse("text/ spaces ") is syntax);
	assert(parse("text/@") is syntax);

	// The double quote is a tspecial and must not appear in a token.
	assert(parse("text/pl\"ain") is syntax);
	assert(parse("te\"xt/plain") is syntax);

	let res = parse("text/plain;charset")!;
	assert(res.0 == "text/plain");
	assert(next_param(&res.1) is syntax);
};