strings.ha (7978B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use encoding::utf8;
use endian;
use io;
use strings;


// Attribute bit: the character is allowed in a NumericString.
def N: u8 = 0o1;

// Attribute bit: the character is allowed in a PrintableString.
def P: u8 = 0o2;

// Lookup table holding the attribute bits ([[N]], [[P]]) of each of the 128
// seven-bit character codes. Rows are labelled with the octal code of their
// first entry; every row covers eight consecutive codes.
const cclass: [_]u8 = [
	// control characters, octal 0 to 37
	0, 0, 0, 0, 0, 0, 0, 0,			// 0
	0, 0, 0, 0, 0, 0, 0, 0,			// 10
	0, 0, 0, 0, 0, 0, 0, 0,			// 20
	0, 0, 0, 0, 0, 0, 0, 0,			// 30
	// space ! " # $ % & '
	N|P, 0, 0, 0, 0, 0, 0, P,		// 40
	// ( ) * + , - . /
	P, P, 0, P, P, P, P, P,			// 50
	// 0 1 2 3 4 5 6 7
	N|P, N|P, N|P, N|P, N|P, N|P, N|P, N|P,	// 60
	// 8 9 : ; < = > ?
	N|P, N|P, P, 0, 0, P, 0, P,		// 70
	// @ A B C D E F G
	0, P, P, P, P, P, P, P,			// 100
	// H to O
	P, P, P, P, P, P, P, P,			// 110
	// P to W
	P, P, P, P, P, P, P, P,			// 120
	// X Y Z [ \ ] ^ _
	P, P, P, 0, 0, 0, 0, 0,			// 130
	// ` a b c d e f g
	0, P, P, P, P, P, P, P,			// 140
	// h to o
	P, P, P, P, P, P, P, P,			// 150
	// p to w
	P, P, P, P, P, P, P, P,			// 160
	// x y z { | } ~ DEL
	P, P, P, 0, 0, 0, 0, 0,			// 170
];

// Predicate deciding whether a single byte is acceptable for a string type.
type char_validator = fn (c: u8) bool;

// Whether 'c' is valid in a NumericString. Bytes with the high bit set are
// rejected before the table lookup, since [[cclass]] only has 128 entries.
fn c_is_num(c: u8) bool = c < 0x80 && (cclass[c] & N) == N;

// Whether 'c' is valid in a PrintableString. Bytes with the high bit set are
// rejected before the table lookup, since [[cclass]] only has 128 entries.
fn c_is_print(c: u8) bool = c < 0x80 && (cclass[c] & P) == P;

// Whether 'c' is valid in an IA5String, that is any byte without the high bit.
fn c_is_ia5(c: u8) bool = c < 0x80;

// Returns the size in bytes of the largest prefix of 'buf' that ends on a
// complete utf8 sequence; a trailing partial sequence is left out. Returns
// invalid, if that prefix is not valid utf8 or if none of the last four bytes
// can start a sequence.
// Bytes at the end of 'buf' that can neither start a sequence nor belong to
// the last started one make the whole buffer invalid.
fn validutf8(buf: []u8) (size | invalid) = {
	if (len(buf) == 0) {
		return 0z;
	};

	// A rune takes at most four bytes, so only the last four bytes are
	// searched for the start of the final sequence.
	const min = if (len(buf) < 4) 0z else len(buf) - 4;

	let lastvalid = 0z;
	let lastsz = 0z;
	for (let i = min; i < len(buf); i += 1) {
		match (utf8::utf8sz(buf[i])) {
		case utf8::invalid => void;
		case let s: size =>
			lastsz = s;
			lastvalid = i;
		};
	};

	// none of the inspected bytes can start a sequence
	if (lastsz == 0) return invalid;

	// The bytes following the last sequence start are known not to start
	// a sequence themselves. If there are more of them than the sequence
	// can hold, the surplus is garbage rather than an incomplete rune.
	const tail = len(buf) - lastvalid;
	if (tail > lastsz) return invalid;

	// drop the final sequence from the result if it is still incomplete
	const n = if (tail == lastsz) len(buf) else lastvalid;
	if (utf8::validate(buf[..n]) is utf8::invalid) {
		return invalid;
	};

	return n;
};

@test fn validutf8() void = {
	let b: [_]u8 = [
		0x55, 0x56, 0xd0, 0x98, 0xe0, 0xa4, 0xb9, 0xf0, 0x90, 0x8d, 0x88
	];
	// expected result for every prefix length, including the full buffer
	const runesat: [_]size = [0, 1, 2, 2, 4, 4, 4, 7, 7, 7, 7, 11];

	for (let i = 0z; i <= len(b); i += 1) {
		assert(validutf8(b[..i])! == runesat[i]);
	};

	// a continuation byte that follows a complete rune is not an
	// incomplete rune
	const stray: [_]u8 = [0x55, 0x80];
	assert(validutf8(stray) is invalid);

	b[10] = 0x55;
	assert(validutf8(b[..10])! == 7);
	assert(validutf8(b) is invalid);
};

// An io::stream reader that returns only valid utf8 chunks on read.
export type utf8stream = struct {
	stream: io::stream,
	// decoder the string data is read from
	d: *decoder,
	// conversion function matching the string type that is read
	strdec: *strdecoder,
};

const utf8stream_vtable = io::vtable {
	reader = &utf8stream_reader,
	...
};

// Read implementation of [[utf8stream]]. Hands 'buf' to the stream's string
// decoder and returns its result. Aborts if 'buf' is shorter than 4 bytes or
// if the decoder has no current data element header.
fn utf8stream_reader(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
	// at least a rune must fit in buf
	assert(len(buf) >= 4);
	let s = s: *utf8stream;
	if (s.d.cur is void) {
		abort();
	};

	return s.strdec(s, buf);
};

// Function that fills 'buf' with utf8 data converted from the string data
// read through 's'. Returns the number of bytes stored in 'buf'.
export type strdecoder = fn(
	s: *utf8stream,
	buf: []u8,
) (size | io::EOF | io::error);

// Passes the data through without checking or converting it.
fn no_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
	dataread(s.d, buf);

// Reads data into 'buf' and returns invalid (wrapped as io::error) if any of
// the bytes read is rejected by the validator 'v'.
fn char_decoder(
	s: *utf8stream, buf: []u8,
	v: *char_validator,
) (size | io::EOF | io::error) = {
	let n = match (dataread(s.d, buf)?) {
	case let n: size =>
		yield n;
	case io::EOF =>
		return io::EOF;
	};

	for (let i = 0z; i < n; i += 1) {
		if (!v(buf[i])) return wrap_err(invalid);
	};
	return n;
};

// Decoder for NumericString, see [[c_is_num]].
fn num_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
	char_decoder(s, buf, &c_is_num);

// Decoder for PrintableString, see [[c_is_print]].
fn print_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
	char_decoder(s, buf, &c_is_print);

// Decoder for IA5String, see [[c_is_ia5]].
fn ia5_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
	char_decoder(s, buf, &c_is_ia5);

// Decoder for UTF8String. Reads data into 'buf' and returns only the part
// that [[validutf8]] accepts as complete. The bytes of a trailing incomplete
// rune are handed back to the decoder with dataunread, so that a later read
// sees them again.
fn utf8_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
	let n = 0z;

	n += match (dataread(s.d, buf)?) {
	case let sz: size =>
		yield sz;
	case io::EOF =>
		// NOTE(review): unbufn presumably counts bytes that were
		// handed back earlier and never consumed - confirm
		if (s.d.unbufn > 0) return wrap_err(invalid);
		return io::EOF;
	};

	const max = match (validutf8(buf[..n])) {
	case let s: size =>
		yield s;
	case invalid =>
		return wrap_err(invalid);
	};

	if (max < n) {
		if (dataeof(s.d)) {
			// string ends with incomplete rune
			return wrap_err(invalid);
		};
		dataunread(s.d, buf[max..n]);
		return max;
	};

	return n;
};

// Whether 'cp' is a Unicode scalar value: not above U+10FFFF and not a
// surrogate (U+D800 to U+DFFF). Other values have no valid utf8 encoding.
fn is_unicode_scalar(cp: u32) bool =
	cp <= 0x10FFFF && (cp < 0xD800 || cp > 0xDFFF);

// A bmp string is an UTF-16 string.
//
// Every big endian 16 bit unit is converted to one rune on its own. Units in
// the surrogate range are rejected with invalid.
fn bmp_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
	// TODO disallow control functions (X.690: 8.23.9)

	let n = 0z;
	let rbuf: [2]u8 = [0...];
	for (true) {
		match (dataread(s.d, rbuf)?) {
		case let sz: size =>
			if (sz < 2) return wrap_err(invalid);
		case io::EOF =>
			return if (n == 0) io::EOF else n;
		};

		// The value comes from untrusted input; check it here so that
		// the error assertion on encoderune can not be triggered by it.
		const cp = endian::begetu16(rbuf): u32;
		if (!is_unicode_scalar(cp)) return wrap_err(invalid);

		let rb = utf8::encoderune(cp: rune)!;
		if (len(buf) - n < len(rb)) {
			// out of room: hand the unit back for the next read
			dataunread(s.d, rbuf);
			return n;
		};

		buf[n..n + len(rb)] = rb;
		n += len(rb);
	};
};

// Universal string is an UTF32BE string.
//
// Values that are not Unicode scalar values are rejected with invalid.
fn universal_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
	let n = 0z;
	let rbuf: [4]u8 = [0...];
	for (true) {
		match (dataread(s.d, rbuf)?) {
		case let sz: size =>
			if (sz < 4) return wrap_err(invalid);
		case io::EOF =>
			return if (n == 0) io::EOF else n;
		};

		// The value comes from untrusted input; check it here so that
		// the error assertion on encoderune can not be triggered by it.
		const cp = endian::begetu32(rbuf);
		if (!is_unicode_scalar(cp)) return wrap_err(invalid);

		let rb = utf8::encoderune(cp: rune)!;
		if (len(buf) - n < len(rb)) {
			// out of room: hand the unit back for the next read
			dataunread(s.d, rbuf);
			return n;
		};

		buf[n..n + len(rb)] = rb;
		n += len(rb);
	};
};

// Decoder for TeletexString (T.61). The data is read byte by byte; up to two
// bytes are collected in 'in' until t61_chardecode yields a rune for them.
fn t61_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
	let inbuf: [2]u8 = [0...];
	let in = inbuf[..0];

	let n = 0z;

	for (true) {
		let chr: [1]u8 = [0];
		match (dataread(s.d, chr)?) {
		case let sz: size =>
			assert(sz == 1);
			static append(in, chr[0])!;
		case io::EOF =>
			// data ends in the middle of a character
			if (len(in) > 0) return wrap_err(invalid);
			if (n > 0) return n;
			return io::EOF;
		};

		match (t61_chardecode(in)) {
		case let r: rune =>
			let raw = utf8::encoderune(r)!;
			const bufremain = len(buf) - n;
			// a rune that exactly fills the rest of buf still fits
			if (len(raw) <= bufremain) {
				buf[n..n + len(raw)] = raw[..];
				n += len(raw);
				in = inbuf[..0];
			} else {
				// out of room: hand the bytes of this
				// character back for the next read
				dataunread(s.d, in);
				break;
			};
		case insufficient =>
			// leave combining char in in
			void;
		case invalid =>
			return wrap_err(invalid);
		};
	};

	return n;
};

// Creates an [[utf8stream]] that reads from 'd' using the decoder function
// for the string type 't'. Returns invalid for any other utag. The data
// element header is not read here.
fn newstrreader(d: *decoder, t: utag) (utf8stream | error) = {
	let strdec: *strdecoder = switch (t) {
	case utag::NUMERIC_STRING =>
		yield &num_decoder;
	case utag::PRINTABLE_STRING =>
		yield &print_decoder;
	case utag::IA5_STRING =>
		yield &ia5_decoder;
	case utag::UTF8_STRING =>
		yield &utf8_decoder;
	case utag::TELETEX_STRING =>
		yield &t61_decoder;
	case utag::BMP_STRING =>
		yield &bmp_decoder;
	case utag::UNIVERSAL_STRING =>
		yield &universal_decoder;
	case =>
		return invalid;
	};

	return utf8stream {
		stream = &utf8stream_vtable,
		d = d,
		strdec = strdec,
		...
	};
};

// Returns an [[utf8stream]] for a supported utag 't', which is one of:
//   * utag::NUMERIC_STRING
//   * utag::PRINTABLE_STRING
//   * utag::IA5_STRING
//   * utag::UTF8_STRING
//   * utag::TELETEX_STRING
//   * utag::BMP_STRING
//   * utag::UNIVERSAL_STRING
//
// Reads the next data element header from 'd' and checks it against 't'.
// Aborts if 't' is not one of the utags listed above.
export fn strreader(d: *decoder, t: utag) (utf8stream | error) = {
	let dh = next(d)?;
	expect_utag(dh, t)?;
	return newstrreader(d, t)!;
};

// Reads a printable string into 'buf'.
// Returns the size reported by read_bytes, or invalid if one of the bytes read
// is not allowed in a PrintableString. The next data element of 'd' must carry
// the tag utag::PRINTABLE_STRING.
export fn read_printstr(d: *decoder, buf: []u8) (size | error) = {
	expect_utag(next(d)?, utag::PRINTABLE_STRING)?;

	const nread = read_bytes(d, buf)?;
	for (let c .. buf[..nread]) {
		if (!c_is_print(c)) {
			return invalid;
		};
	};
	return nread;
};

// Reads an utf8 string into 'buf' and returns a str that borrows from buf.
// The next data element of 'd' must carry the tag utag::UTF8_STRING.
//
// NOTE(review): the string reader asserts that it is handed at least 4 bytes,
// so 'buf' presumably needs 4 bytes of room beyond the string itself - confirm.
export fn read_utf8str(d: *decoder, buf: []u8) (str | error) = {
	expect_utag(next(d)?, utag::UTF8_STRING)?;

	let r = newstrreader(d, utag::UTF8_STRING)!;

	// keep filling the unused rest of buf until the reader reports EOF
	let filled = 0z;
	for (true) {
		match (io::read(&r, buf[filled..])) {
		case let sz: size =>
			filled += sz;
		case io::EOF =>
			break;
		case let e: io::error =>
			return unwrap_err(e);
		};
	};

	// the reader only hands out validated utf8
	return strings::fromutf8(buf[..filled])!;
};