strings.ha (7986B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use encoding::utf8;
use endian;
use errors;
use io;
use strings;


// Attribute bit: the character may appear in a NumericString.
def N: u8 = 0o1;

// Attribute bit: the character may appear in a PrintableString.
def P: u8 = 0o2;

// Lookup table holding the attribute bits ([[N]], [[P]]) of each of the 128
// seven-bit character codes. Every row covers eight consecutive codes; the
// comment at the end of a row is the octal code of its first entry.
const cclass: [_]u8 = [
//	 0    1    2    3    4    5    6    7
	0,   0,   0,   0,   0,   0,   0,   0,   // 0
	0,   0,   0,   0,   0,   0,   0,   0,   // 10
	0,   0,   0,   0,   0,   0,   0,   0,   // 20
	0,   0,   0,   0,   0,   0,   0,   0,   // 30
	N|P, 0,   0,   0,   0,   0,   0,   P,   // 40
	P,   P,   0,   P,   P,   P,   P,   P,   // 50
	N|P, N|P, N|P, N|P, N|P, N|P, N|P, N|P, // 60
	N|P, N|P, P,   0,   0,   P,   0,   P,   // 70
	0,   P,   P,   P,   P,   P,   P,   P,   // 100
	P,   P,   P,   P,   P,   P,   P,   P,   // 110
	P,   P,   P,   P,   P,   P,   P,   P,   // 120
	P,   P,   P,   0,   0,   0,   0,   0,   // 130
	0,   P,   P,   P,   P,   P,   P,   P,   // 140
	P,   P,   P,   P,   P,   P,   P,   P,   // 150
	P,   P,   P,   P,   P,   P,   P,   P,   // 160
	P,   P,   P,   0,   0,   0,   0,   0,   // 170
];

// Predicate deciding whether a single byte is acceptable for a string type.
type char_validator = fn (c: u8) bool;

// Whether 'c' is valid in a NumericString. Bytes with the high bit set are
// rejected before the table is consulted, since the table only has 128 entries.
fn c_is_num(c: u8) bool = {
	if (c >= 0x80) {
		return false;
	};
	return (cclass[c] & N) != 0;
};

// Whether 'c' is valid in a PrintableString. Bytes with the high bit set are
// rejected before the table is consulted, since the table only has 128 entries.
fn c_is_print(c: u8) bool = {
	if (c >= 0x80) {
		return false;
	};
	return (cclass[c] & P) != 0;
};

// Whether 'c' is valid in an IA5String, i.e. whether its high bit is clear.
fn c_is_ia5(c: u8) bool = c < 0x80;

// Returns the number of bytes of the biggest complete utf8 chunk at the start
// of 'buf'. Returns invalid if that chunk contains invalid utf8 characters.
// Only the last four bytes of 'buf' are scanned to locate the start of a
// possibly truncated trailing sequence; the accepted prefix is then validated
// as a whole. An empty 'buf' yields 0.
fn validutf8(buf: []u8) (size | invalid) = {
	if (len(buf) == 0) {
		return 0z;
	};

	// A rune occupies at most four bytes, so the leading byte of the last
	// (possibly incomplete) sequence must be among the last four bytes.
	const min = if (len(buf) < 4) 0z else len(buf) - 4;

	// Offset and announced length of the last byte in the tail for which
	// utf8::utf8sz reports a size.
	let lastvalid = 0z;
	let lastsz = 0z;
	for (let i = min; i < len(buf); i += 1) {
		match (utf8::utf8sz(buf[i])) {
		case utf8::invalid => void;
		case let s: size =>
			lastsz = s;
			lastvalid = i;
		};
	};

	// None of the inspected bytes can start a sequence.
	if (lastsz == 0) return invalid;

	// Keep the trailing sequence only if all of its bytes are present,
	// otherwise cut the chunk right before it.
	const n = if (len(buf) - lastvalid == lastsz) len(buf) else lastvalid;
	if (utf8::validate(buf[..n]) is utf8::invalid) {
		return invalid;
	};

	return n;
};

@test fn validutf8() void = {
	let b: [_]u8 = [
		0x55, 0x56, 0xd0, 0x98, 0xe0, 0xa4, 0xb9, 0xf0, 0x90, 0x8d, 0x88
	];
	const runesat: [_]size = [0, 1, 2, 2, 4, 4, 4, 7, 7, 7, 7, 8];

	for (let i = 0z; i < len(b); i += 1) {
		assert(validutf8(b[..i])! == runesat[i]);
	};

	b[10] = 0x55;
	assert(validutf8(b[..10])! == 7);
	assert(validutf8(b) is invalid);
};

// An io::stream reader that returns only valid utf8 chunks on read.
export type utf8stream = struct {
	stream: io::stream,
	// Decoder the string data is read from.
	d: *decoder,
	// Conversion routine selected for the string type being read.
	strdec: *strdecoder,
};

const utf8stream_vtable = io::vtable {
	reader = &utf8stream_reader,
	...
};

// Reader implementation of [[utf8stream]]. Forwards the read to the stream's
// string decoder. Aborts if 'buf' is shorter than 4 bytes or if the decoder
// has no current data header (s.d.cur is void).
fn utf8stream_reader(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
	// at least a rune must fit in buf
	assert(len(buf) >= 4);
	let s = s: *utf8stream;
	if (s.d.cur is void) {
		abort();
	};

	return s.strdec(s, buf);
};

// Function that reads string data from the decoder of 's' and stores it as
// utf8 into 'buf'. Returns the number of bytes stored in 'buf'.
export type strdecoder = fn(
	s: *utf8stream,
	buf: []u8,
) (size | io::EOF | io::error);

// Passes the data through without any validation or conversion.
fn no_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
	dataread(s.d, buf);

// Reads data into 'buf' and checks every byte read with the validator 'v'.
// Fails with a wrapped invalid error on the first byte that 'v' rejects.
fn char_decoder(
	s: *utf8stream, buf: []u8,
	v: *char_validator,
) (size | io::EOF | io::error) = {
	let n = match (dataread(s.d, buf)?) {
	case let n: size =>
		yield n;
	case io::EOF =>
		return io::EOF;
	};

	for (let i = 0z; i < n; i += 1) {
		if (!v(buf[i])) return wrap_err(invalid);
	};
	return n;
};

// Decoder for NumericString.
fn num_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
	char_decoder(s, buf, &c_is_num);

// Decoder for PrintableString.
fn print_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
	char_decoder(s, buf, &c_is_print);

// Decoder for IA5String.
fn ia5_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
	char_decoder(s, buf, &c_is_ia5);

// Decoder for UTF8String. Returns only complete, valid utf8 sequences; the
// bytes of a sequence that is cut off at the end of the read are handed back
// to the decoder so that a later read sees them again.
fn utf8_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
	const n = match (dataread(s.d, buf)?) {
	case let sz: size =>
		yield sz;
	case io::EOF =>
		// bytes handed back earlier never got completed
		if (s.d.unbufn > 0) return wrap_err(invalid);
		return io::EOF;
	};

	const max = match (validutf8(buf[..n])) {
	case let valid: size =>
		yield valid;
	case invalid =>
		return wrap_err(invalid);
	};

	if (max < n) {
		if (dataeof(s.d)) {
			// string ends with incomplete rune
			return wrap_err(invalid);
		};
		dataunread(s.d, buf[max..n]);
		return max;
	};

	return n;
};

// A bmp string is an UTF-16 string. Every 16 bit big endian code unit is
// converted to utf8 on its own. Code units in the surrogate range are
// rejected with a wrapped invalid error, because they are not Unicode scalar
// values and therefore have no valid utf8 representation.
fn bmp_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
	// TODO disallow control functions (X.690: 8.23.9)

	let n = 0z;
	let rbuf: [2]u8 = [0...];
	for (true) {
		match (dataread(s.d, rbuf)?) {
		case let sz: size =>
			// the data ends in the middle of a code unit
			if (sz < 2) return wrap_err(invalid);
		case io::EOF =>
			return if (n == 0) io::EOF else n;
		};

		const unit = endian::begetu16(rbuf);
		if (unit >= 0xD800 && unit <= 0xDFFF) {
			return wrap_err(invalid);
		};

		let rb = utf8::encoderune(unit: rune);
		if (len(buf) - n < len(rb)) {
			// no room left; keep the code unit for the next read
			dataunread(s.d, rbuf);
			return n;
		};

		buf[n..n + len(rb)] = rb;
		n += len(rb);
	};
};

// Universal string is an UTF32BE string. Every 32 bit big endian value is
// converted to utf8. Values that are not Unicode scalar values (surrogates and
// everything above 0x10FFFF) are rejected with a wrapped invalid error, because
// they have no valid utf8 representation.
fn universal_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
	let n = 0z;
	let rbuf: [4]u8 = [0...];
	for (true) {
		match (dataread(s.d, rbuf)?) {
		case let sz: size =>
			// the data ends in the middle of a character
			if (sz < 4) return wrap_err(invalid);
		case io::EOF =>
			return if (n == 0) io::EOF else n;
		};

		const cp = endian::begetu32(rbuf);
		if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
			return wrap_err(invalid);
		};

		let rb = utf8::encoderune(cp: rune);
		if (len(buf) - n < len(rb)) {
			// no room left; keep the character for the next read
			dataunread(s.d, rbuf);
			return n;
		};

		buf[n..n + len(rb)] = rb;
		n += len(rb);
	};
};

// Decoder for TeletexString (T.61). Reads the data byte by byte and converts
// every character recognized by t61_chardecode to utf8. A character may need
// two input bytes, which are collected in 'in' until they can be decoded.
fn t61_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
	let inbuf: [2]u8 = [0...];
	let in = inbuf[..0];

	let n = 0z;

	for (true) {
		let chr: [1]u8 = [0];
		match (dataread(s.d, chr)?) {
		case let sz: size =>
			assert(sz == 1);
			static append(in, chr[0]);
		case io::EOF =>
			// data ends in the middle of a two byte character
			if (len(in) > 0) return wrap_err(invalid);
			if (n > 0) return n;
			return io::EOF;
		};

		match (t61_chardecode(in)) {
		case let r: rune =>
			let raw = utf8::encoderune(r);
			const bufremain = len(buf) - n;
			// A rune that exactly fills the rest of 'buf' still
			// fits, as in the bmp and universal decoders.
			if (len(raw) <= bufremain) {
				buf[n..n + len(raw)] = raw[..];
				n += len(raw);
				in = inbuf[..0];
			} else {
				// no room left; keep the input for the next read
				dataunread(s.d, in);
				break;
			};
		case insufficient =>
			// leave combining char in in
			void;
		case invalid =>
			return wrap_err(invalid);
		};
	};

	return n;
};

// Creates an [[utf8stream]] that reads from 'd' using the string decoder that
// belongs to the utag 't'. Returns invalid if 't' is not a supported string
// type. The data header is expected to be consumed by the caller already.
fn newstrreader(d: *decoder, t: utag) (utf8stream | error) = {
	let strdec: *strdecoder = switch (t) {
	case utag::NUMERIC_STRING =>
		yield &num_decoder;
	case utag::PRINTABLE_STRING =>
		yield &print_decoder;
	case utag::IA5_STRING =>
		yield &ia5_decoder;
	case utag::UTF8_STRING =>
		yield &utf8_decoder;
	case utag::TELETEX_STRING =>
		yield &t61_decoder;
	case utag::BMP_STRING =>
		yield &bmp_decoder;
	case utag::UNIVERSAL_STRING =>
		yield &universal_decoder;
	case =>
		return invalid;
	};

	return utf8stream {
		stream = &utf8stream_vtable,
		d = d,
		strdec = strdec,
		...
	};
};

// Returns an [[utf8stream]] for a supported utag 't', which is one of:
//   * utag::NUMERIC_STRING
//   * utag::PRINTABLE_STRING
//   * utag::IA5_STRING
//   * utag::UTF8_STRING
//   * utag::TELETEX_STRING
//   * utag::BMP_STRING
//   * utag::UNIVERSAL_STRING
//
// Fails if the next element does not carry the utag 't'. Passing a utag that is
// not in the list above is a programming error and aborts once the element's
// tag has been matched.
export fn strreader(d: *decoder, t: utag) (utf8stream | error) = {
	let dh = next(d)?;
	expect_utag(dh, t)?;
	return newstrreader(d, t)!;
};

// Reads a printable string into 'buf'.
// Returns the number of bytes stored in 'buf'. Returns invalid if any of these
// bytes is not allowed in a PrintableString; 'buf' has been written to by then.
export fn read_printstr(d: *decoder, buf: []u8) (size | error) = {
	const hdr = next(d)?;
	expect_utag(hdr, utag::PRINTABLE_STRING)?;

	const total = read_bytes(d, buf)?;

	// Validate only what was actually read, not the whole of 'buf'.
	let pos = 0z;
	for (pos < total) {
		if (!c_is_print(buf[pos])) {
			return invalid;
		};
		pos += 1;
	};
	return total;
};

// Reads an utf8 string into 'buf' and returns a str that borrows from buf.
//
// NOTE(review): utf8stream_reader asserts that the buffer it is handed holds at
// least 4 bytes, so this loop appears to abort once fewer than 4 bytes of 'buf'
// are left unused before io::EOF is seen. Confirm that callers size 'buf' with
// enough headroom.
export fn read_utf8str(d: *decoder, buf: []u8) (str | error) = {
	const hdr = next(d)?;
	expect_utag(hdr, utag::UTF8_STRING)?;

	// UTF8_STRING is always supported, so creating the reader cannot fail.
	let reader = newstrreader(d, utag::UTF8_STRING)!;
	let filled = 0z;

	// Keep appending chunks behind the data collected so far until the
	// string is exhausted.
	for (true) {
		match (io::read(&reader, buf[filled..])) {
		case let sz: size =>
			filled += sz;
		case io::EOF =>
			break;
		case let e: io::error =>
			return unwrap_err(e);
		};
	};

	return strings::fromutf8(buf[..filled])!;
};