hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

strings.ha (7986B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use encoding::utf8;
      5 use endian;
      6 use errors;
      7 use io;
      8 use strings;
      9 
     10 
// Attribute bit used in [[cclass]]: the character is allowed in a
// NumericString (tested by [[c_is_num]]).
def N: u8 = 0o1;

// Attribute bit used in [[cclass]]: the character is allowed in a
// PrintableString (tested by [[c_is_print]]).
def P: u8 = 0o2;
     16 
// Lookup table of attribute bitfields ([[N]], [[P]]), indexed by character
// code. Each row holds eight entries; the trailing comment of a row is the
// octal code of its first entry and the header line gives the offset within
// the row. The table has 16 * 8 = 128 entries, so it may only be indexed with
// values below 0x80; the c_is_* helpers check the high bit before looking up.
const cclass: [_]u8 = [
//	 0	 1	 2	 3	 4	 5	 6	 7
	0,	0,	0,	0,	0,	0,	0,	0,	// 0
	0,	0,	0,	0,	0,	0,	0,	0,	// 10
	0,	0,	0,	0,	0,	0,	0,	0,	// 20
	0,	0,	0,	0,	0,	0,	0,	0,	// 30
	N|P,	0,	0,	0,	0,	0,	0,	P,	// 40
	P,	P,	0,	P,	P,	P,	P,	P,	// 50
	N|P,	N|P,	N|P,	N|P,	N|P,	N|P,	N|P,	N|P,	// 60
	N|P,	N|P,	P,	0,	0,	P,	0,	P,	// 70
	0,	P,	P,	P,	P,	P,	P,	P,	// 100
	P,	P,	P,	P,	P,	P,	P,	P,	// 110
	P,	P,	P,	P,	P,	P,	P,	P,	// 120
	P,	P,	P,	0,	0,	0,	0,	0,	// 130
	0,	P,	P,	P,	P,	P,	P,	P,	// 140
	P,	P,	P,	P,	P,	P,	P,	P,	// 150
	P,	P,	P,	P,	P,	P,	P,	P,	// 160
	P,	P,	P,	0,	0,	0,	0,	0,	// 170
];
     37 
// Predicate that decides whether the single byte 'c' is allowed in a
// restricted string type. Passed to [[char_decoder]] to validate every byte
// that was read.
type char_validator = fn (c: u8) bool;
     39 
     40 // Whether 'c' is valid in a NumericString
     41 fn c_is_num(c: u8) bool = c & 0x80 == 0 && cclass[c] & N != 0;
     42 
     43 // Whether 'c' is valid in a PrintableString
     44 fn c_is_print(c: u8) bool = c & 0x80 == 0 && cclass[c] & P != 0;
     45 
     46 fn c_is_ia5(c: u8) bool = c & 0x80 == 0;
     47 
     48 // Returns the number of bytes of the biggest complete utf8 chunk. Returns
     49 // invalid, if the biggest complete chunk contains invalid utf8 characters.
     50 fn validutf8(buf: []u8) (size | invalid) = {
     51 	if (len(buf) == 0) {
     52 		return 0z;
     53 	};
     54 
     55 	const min = if (len(buf) < 4) 0z else len(buf) - 4;
     56 
     57 	let lastvalid = 0z;
     58 	let lastsz = 0z;
     59 	for (let i = min; i < len(buf); i += 1) {
     60 		match (utf8::utf8sz(buf[i])) {
     61 		case utf8::invalid => void;
     62 		case let s: size =>
     63 			lastsz = s;
     64 			lastvalid = i;
     65 		};
     66 	};
     67 
     68 	if (lastsz == 0) return invalid;
     69 
     70 	const n = if (len(buf) - lastvalid == lastsz) len(buf) else lastvalid;
     71 	if (utf8::validate(buf[..n]) is utf8::invalid) {
     72 		return invalid;
     73 	};
     74 
     75 	return n;
     76 };
     77 
     78 @test fn validutf8() void = {
     79 	let b: [_]u8 = [
     80 		0x55, 0x56, 0xd0, 0x98, 0xe0, 0xa4, 0xb9, 0xf0, 0x90, 0x8d, 0x88
     81 	];
     82 	const runesat: [_]size = [0, 1, 2, 2, 4, 4, 4, 7, 7, 7, 7, 8];
     83 
     84 	for (let i = 0z; i < len(b); i += 1) {
     85 		assert(validutf8(b[..i])! == runesat[i]);
     86 	};
     87 
     88 	b[10] = 0x55;
     89 	assert(validutf8(b[..10])! == 7);
     90 	assert(validutf8(b) is invalid);
     91 };
     92 
// An io::stream reader that returns only valid utf8 chunks on read.
//
// Reads are forwarded to 'strdec', which pulls the raw string data from the
// decoder 'd' and validates or converts it (see [[utf8stream_reader]]).
export type utf8stream = struct {
	// [[utf8stream_reader]] casts the *io::stream it receives back to
	// *utf8stream; presumably this is why the stream is the first field.
	stream: io::stream,
	// decoder the string data is read from
	d: *decoder,
	// conversion routine selected for the string type, see [[newstrreader]]
	strdec: *strdecoder,
};
     99 
// Vtable shared by all [[utf8stream]]s. Only the reader is set explicitly; the
// remaining fields are filled in by '...'.
const utf8stream_vtable = io::vtable {
	reader = &utf8stream_reader,
	...
};
    104 
    105 fn utf8stream_reader(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
    106 	// at least a rune must fit in buf
    107 	assert(len(buf) >= 4);
    108 	let s = s: *utf8stream;
    109 	let cur = match (s.d.cur) {
    110 	case void =>
    111 		abort();
    112 	case let dh: head =>
    113 		yield dh;
    114 	};
    115 
    116 	match (s.strdec(s, buf)?) {
    117 	case let n: size =>
    118 		return n;
    119 	case io::EOF =>
    120 		return io::EOF;
    121 	};
    122 };
    123 
// Signature of the per string type routines used by [[utf8stream]]. An
// implementation reads string data from the decoder 's.d', stores the
// resulting bytes in 'buf' and returns how many bytes it stored, or io::EOF
// once the data is exhausted.
export type strdecoder = fn(
	s: *utf8stream,
	buf: []u8,
) (size | io::EOF | io::error);
    128 
    129 fn no_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
    130 	dataread(s.d, buf);
    131 
    132 fn char_decoder(
    133 	s: *utf8stream, buf: []u8,
    134 	v: *char_validator,
    135 ) (size | io::EOF | io::error) = {
    136 	let n = match (dataread(s.d, buf)?) {
    137 	case let n: size =>
    138 		yield n;
    139 	case io::EOF =>
    140 		return io::EOF;
    141 	};
    142 
    143 	for (let i = 0z; i < n; i += 1) {
    144 		if (!v(buf[i])) return wrap_err(invalid);
    145 	};
    146 	return n;
    147 };
    148 
    149 fn num_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
    150 	char_decoder(s, buf, &c_is_num);
    151 
    152 fn print_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
    153 	char_decoder(s, buf, &c_is_print);
    154 
    155 fn ia5_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
    156 	char_decoder(s, buf, &c_is_ia5);
    157 
    158 fn utf8_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
    159 	let n = 0z;
    160 
    161 	n += match (dataread(s.d, buf)?) {
    162 	case let sz: size =>
    163 		yield sz;
    164 	case io::EOF =>
    165 		if (s.d.unbufn > 0) return wrap_err(invalid);
    166 		return io::EOF;
    167 	};
    168 
    169 	const max = match (validutf8(buf[..n])) {
    170 	case let s: size =>
    171 		yield s;
    172 	case invalid =>
    173 		return wrap_err(invalid);
    174 	};
    175 
    176 	if (max < n) {
    177 		if (dataeof(s.d)) {
    178 			// string ends with incomplete rune
    179 			return wrap_err(invalid);
    180 		};
    181 		dataunread(s.d, buf[max..n]);
    182 		return max;
    183 	};
    184 
    185 	return n;
    186 };
    187 
    188 // A bmp string is an UTF-16 string.
    189 fn bmp_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
    190 	const max = len(buf) - (len(buf) % 2);
    191 
    192 	// TODO disallow control functions (X.690: 8.23.9)
    193 
    194 	let n = 0z;
    195 	let rbuf: [2]u8 = [0...];
    196 	for (true) {
    197 		match (dataread(s.d, rbuf)?) {
    198 		case let sz: size =>
    199 			if (sz < 2) return wrap_err(invalid);
    200 		case io::EOF =>
    201 			return if (n == 0) io::EOF else n;
    202 		};
    203 
    204 		let r = endian::begetu16(rbuf): rune;
    205 		let rb = utf8::encoderune(r);
    206 		if (len(buf) - n < len(rb)) {
    207 			dataunread(s.d, rbuf);
    208 			return n;
    209 		};
    210 
    211 		buf[n..n + len(rb)] = rb;
    212 		n += len(rb);
    213 	};
    214 };
    215 
    216 // Universal string is an UTF32BE string.
    217 fn universal_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
    218 	const max = len(buf) - (len(buf) % 4);
    219 
    220 	let n = 0z;
    221 	let rbuf: [4]u8 = [0...];
    222 	for (true) {
    223 		match (dataread(s.d, rbuf)?) {
    224 		case let sz: size =>
    225 			if (sz < 4) return wrap_err(invalid);
    226 		case io::EOF =>
    227 			return if (n == 0) io::EOF else n;
    228 		};
    229 
    230 		let r = endian::begetu32(rbuf): rune;
    231 		let rb = utf8::encoderune(r);
    232 		if (len(buf) - n < len(rb)) {
    233 			dataunread(s.d, rbuf);
    234 			return n;
    235 		};
    236 
    237 		buf[n..n + len(rb)] = rb;
    238 		n += len(rb);
    239 	};
    240 };
    241 
    242 fn t61_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
    243 	let inbuf: [2]u8 = [0...];
    244 	let in = inbuf[..0];
    245 
    246 	let n = 0z;
    247 
    248 	for (true) {
    249 		let chr: [1]u8 = [0];
    250 		match (dataread(s.d, chr)?) {
    251 		case let sz: size =>
    252 			assert(sz == 1);
    253 			static append(in, chr[0]);
    254 		case io::EOF =>
    255 			if (len(in) > 0) return wrap_err(invalid);
    256 			if (n > 0) return n;
    257 			return io::EOF;
    258 		};
    259 
    260 		match (t61_chardecode(in)) {
    261 		case let r: rune =>
    262 			let raw = utf8::encoderune(r);
    263 			const bufremain = len(buf) - n;
    264 			if (len(raw) < bufremain) {
    265 				buf[n..n + len(raw)] = raw[..];
    266 				n += len(raw);
    267 				in = inbuf[..0];
    268 			} else {
    269 				dataunread(s.d, in);
    270 				break;
    271 			};
    272 		case insufficient =>
    273 			// leave combining char in in
    274 			void;
    275 		case invalid =>
    276 			return wrap_err(invalid);
    277 		};
    278 	};
    279 
    280 	return n;
    281 };
    282 
    283 fn newstrreader(d: *decoder, t: utag) (utf8stream | error) = {
    284 	let strdec: *strdecoder = switch (t) {
    285 	case utag::NUMERIC_STRING =>
    286 		yield &num_decoder;
    287 	case utag::PRINTABLE_STRING =>
    288 		yield &print_decoder;
    289 	case utag::IA5_STRING =>
    290 		yield &ia5_decoder;
    291 	case utag::UTF8_STRING =>
    292 		yield &utf8_decoder;
    293 	case utag::TELETEX_STRING =>
    294 		yield &t61_decoder;
    295 	case utag::BMP_STRING =>
    296 		yield &bmp_decoder;
    297 	case utag::UNIVERSAL_STRING =>
    298 		yield &universal_decoder;
    299 	case =>
    300 		return invalid;
    301 	};
    302 
    303 	return utf8stream {
    304 		stream = &utf8stream_vtable,
    305 		d = d,
    306 		strdec = strdec,
    307 		...
    308 	};
    309 };
    310 
    311 // Returns an [[utf8stream]] for a supported utag 't', which is one of:
    312 //   * utag::NUMERIC_STRING
    313 //   * utag::PRINTABLE_STRING
    314 //   * utag::IA5_STRING
    315 //   * utag::UTF8_STRING
    316 //   * utag::TELETEX_STRING
    317 //   * utag::BMP_STRING
    318 //   * utag::UNIVERSAL_STRING
    319 export fn strreader(d: *decoder, t: utag) (utf8stream | error) = {
    320 	let dh = next(d)?;
    321 	expect_utag(dh, t)?;
    322 	return newstrreader(d, t)!;
    323 };
    324 
    325 // Reads a printable string into 'buf'.
    326 export fn read_printstr(d: *decoder, buf: []u8) (size | error) = {
    327 	let dh = next(d)?;
    328 	expect_utag(dh, utag::PRINTABLE_STRING)?;
    329 
    330 	const n = read_bytes(d, buf)?;
    331 
    332 	for (let i = 0z; i < n; i += 1) {
    333 		if (!c_is_print(buf[i])) {
    334 			return invalid;
    335 		};
    336 	};
    337 	return n;
    338 };
    339 
    340 // Reads an utf8 string into 'buf' and returns a str that borrows from buf.
    341 export fn read_utf8str(d: *decoder, buf: []u8) (str | error) = {
    342 	let dh = next(d)?;
    343 	expect_utag(dh, utag::UTF8_STRING)?;
    344 
    345 	let r = newstrreader(d, utag::UTF8_STRING)!;
    346 	let n = 0z;
    347 
    348 	for (true) {
    349 		n += match (io::read(&r, buf[n..])) {
    350 		case let sz: size =>
    351 			yield sz;
    352 		case io::EOF =>
    353 			break;
    354 		case let e: io::error =>
    355 			return unwrap_err(e);
    356 		};
    357 	};
    358 
    359 	return strings::fromutf8(buf[..n])!;
    360 };