hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

strings.ha (7978B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use encoding::utf8;
      5 use endian;
      6 use io;
      7 use strings;
      8 
      9 
     10 // numeric string
     11 def N: u8 = 0o1;
     12 
     13 // printable string
     14 def P: u8 = 0o2;
     15 
     16 // LUT of bitfields with character attributes
     17 const cclass: [_]u8 = [
     18 //	 0	 1	 2	 3	 4	 5	 6	 7
     19 	0,	0,	0,	0,	0,	0,	0,	0,	// 0
     20 	0,	0,	0,	0,	0,	0,	0,	0,	// 10
     21 	0,	0,	0,	0,	0,	0,	0,	0,	// 20
     22 	0,	0,	0,	0,	0,	0,	0,	0,	// 30
     23 	N|P,	0,	0,	0,	0,	0,	0,	P,	// 40
     24 	P,	P,	0,	P,	P,	P,	P,	P,	// 50
     25 	N|P,	N|P,	N|P,	N|P,	N|P,	N|P,	N|P,	N|P,	// 60
     26 	N|P,	N|P,	P,	0,	0,	P,	0,	P,	// 70
     27 	0,	P,	P,	P,	P,	P,	P,	P,	// 100
     28 	P,	P,	P,	P,	P,	P,	P,	P,	// 110
     29 	P,	P,	P,	P,	P,	P,	P,	P,	// 120
     30 	P,	P,	P,	0,	0,	0,	0,	0,	// 130
     31 	0,	P,	P,	P,	P,	P,	P,	P,	// 140
     32 	P,	P,	P,	P,	P,	P,	P,	P,	// 150
     33 	P,	P,	P,	P,	P,	P,	P,	P,	// 160
     34 	P,	P,	P,	0,	0,	0,	0,	0,	// 170
     35 ];
     36 
     37 type char_validator = fn (c: u8) bool;
     38 
     39 // Whether 'c' is valid in a NumericString
     40 fn c_is_num(c: u8) bool = c & 0x80 == 0 && cclass[c] & N != 0;
     41 
     42 // Whether 'c' is valid in a PrintableString
     43 fn c_is_print(c: u8) bool = c & 0x80 == 0 && cclass[c] & P != 0;
     44 
     45 fn c_is_ia5(c: u8) bool = c & 0x80 == 0;
     46 
     47 // Returns the number of bytes of the biggest complete utf8 chunk. Returns
     48 // invalid, if the biggest complete chunk contains invalid utf8 characters.
     49 fn validutf8(buf: []u8) (size | invalid) = {
     50 	if (len(buf) == 0) {
     51 		return 0z;
     52 	};
     53 
     54 	const min = if (len(buf) < 4) 0z else len(buf) - 4;
     55 
     56 	let lastvalid = 0z;
     57 	let lastsz = 0z;
     58 	for (let i = min; i < len(buf); i += 1) {
     59 		match (utf8::utf8sz(buf[i])) {
     60 		case utf8::invalid => void;
     61 		case let s: size =>
     62 			lastsz = s;
     63 			lastvalid = i;
     64 		};
     65 	};
     66 
     67 	if (lastsz == 0) return invalid;
     68 
     69 	const n = if (len(buf) - lastvalid == lastsz) len(buf) else lastvalid;
     70 	if (utf8::validate(buf[..n]) is utf8::invalid) {
     71 		return invalid;
     72 	};
     73 
     74 	return n;
     75 };
     76 
     77 @test fn validutf8() void = {
     78 	let b: [_]u8 = [
     79 		0x55, 0x56, 0xd0, 0x98, 0xe0, 0xa4, 0xb9, 0xf0, 0x90, 0x8d, 0x88
     80 	];
     81 	const runesat: [_]size = [0, 1, 2, 2, 4, 4, 4, 7, 7, 7, 7, 8];
     82 
     83 	for (let i = 0z; i < len(b); i += 1) {
     84 		assert(validutf8(b[..i])! == runesat[i]);
     85 	};
     86 
     87 	b[10] = 0x55;
     88 	assert(validutf8(b[..10])! == 7);
     89 	assert(validutf8(b) is invalid);
     90 };
     91 
     92 // An io::stream reader that returns only valid utf8 chunks on read.
     93 export type utf8stream = struct {
     94 	stream: io::stream,
     95 	d: *decoder,
     96 	strdec: *strdecoder,
     97 };
     98 
     99 const utf8stream_vtable = io::vtable {
    100 	reader = &utf8stream_reader,
    101 	...
    102 };
    103 
    104 fn utf8stream_reader(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
    105 	// at least a rune must fit in buf
    106 	assert(len(buf) >= 4);
    107 	let s = s: *utf8stream;
    108 	let cur = match (s.d.cur) {
    109 	case void =>
    110 		abort();
    111 	case let dh: head =>
    112 		yield dh;
    113 	};
    114 
    115 	match (s.strdec(s, buf)?) {
    116 	case let n: size =>
    117 		return n;
    118 	case io::EOF =>
    119 		return io::EOF;
    120 	};
    121 };
    122 
    123 export type strdecoder = fn(
    124 	s: *utf8stream,
    125 	buf: []u8,
    126 ) (size | io::EOF | io::error);
    127 
    128 fn no_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
    129 	dataread(s.d, buf);
    130 
    131 fn char_decoder(
    132 	s: *utf8stream, buf: []u8,
    133 	v: *char_validator,
    134 ) (size | io::EOF | io::error) = {
    135 	let n = match (dataread(s.d, buf)?) {
    136 	case let n: size =>
    137 		yield n;
    138 	case io::EOF =>
    139 		return io::EOF;
    140 	};
    141 
    142 	for (let i = 0z; i < n; i += 1) {
    143 		if (!v(buf[i])) return wrap_err(invalid);
    144 	};
    145 	return n;
    146 };
    147 
    148 fn num_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
    149 	char_decoder(s, buf, &c_is_num);
    150 
    151 fn print_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
    152 	char_decoder(s, buf, &c_is_print);
    153 
    154 fn ia5_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) =
    155 	char_decoder(s, buf, &c_is_ia5);
    156 
    157 fn utf8_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
    158 	let n = 0z;
    159 
    160 	n += match (dataread(s.d, buf)?) {
    161 	case let sz: size =>
    162 		yield sz;
    163 	case io::EOF =>
    164 		if (s.d.unbufn > 0) return wrap_err(invalid);
    165 		return io::EOF;
    166 	};
    167 
    168 	const max = match (validutf8(buf[..n])) {
    169 	case let s: size =>
    170 		yield s;
    171 	case invalid =>
    172 		return wrap_err(invalid);
    173 	};
    174 
    175 	if (max < n) {
    176 		if (dataeof(s.d)) {
    177 			// string ends with incomplete rune
    178 			return wrap_err(invalid);
    179 		};
    180 		dataunread(s.d, buf[max..n]);
    181 		return max;
    182 	};
    183 
    184 	return n;
    185 };
    186 
    187 // A bmp string is an UTF-16 string.
    188 fn bmp_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
    189 	const max = len(buf) - (len(buf) % 2);
    190 
    191 	// TODO disallow control functions (X.690: 8.23.9)
    192 
    193 	let n = 0z;
    194 	let rbuf: [2]u8 = [0...];
    195 	for (true) {
    196 		match (dataread(s.d, rbuf)?) {
    197 		case let sz: size =>
    198 			if (sz < 2) return wrap_err(invalid);
    199 		case io::EOF =>
    200 			return if (n == 0) io::EOF else n;
    201 		};
    202 
    203 		let r = endian::begetu16(rbuf): rune;
    204 		let rb = utf8::encoderune(r)!;
    205 		if (len(buf) - n < len(rb)) {
    206 			dataunread(s.d, rbuf);
    207 			return n;
    208 		};
    209 
    210 		buf[n..n + len(rb)] = rb;
    211 		n += len(rb);
    212 	};
    213 };
    214 
    215 // Universal string is an UTF32BE string.
    216 fn universal_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
    217 	const max = len(buf) - (len(buf) % 4);
    218 
    219 	let n = 0z;
    220 	let rbuf: [4]u8 = [0...];
    221 	for (true) {
    222 		match (dataread(s.d, rbuf)?) {
    223 		case let sz: size =>
    224 			if (sz < 4) return wrap_err(invalid);
    225 		case io::EOF =>
    226 			return if (n == 0) io::EOF else n;
    227 		};
    228 
    229 		let r = endian::begetu32(rbuf): rune;
    230 		let rb = utf8::encoderune(r)!;
    231 		if (len(buf) - n < len(rb)) {
    232 			dataunread(s.d, rbuf);
    233 			return n;
    234 		};
    235 
    236 		buf[n..n + len(rb)] = rb;
    237 		n += len(rb);
    238 	};
    239 };
    240 
    241 fn t61_decoder(s: *utf8stream, buf: []u8) (size | io::EOF | io::error) = {
    242 	let inbuf: [2]u8 = [0...];
    243 	let in = inbuf[..0];
    244 
    245 	let n = 0z;
    246 
    247 	for (true) {
    248 		let chr: [1]u8 = [0];
    249 		match (dataread(s.d, chr)?) {
    250 		case let sz: size =>
    251 			assert(sz == 1);
    252 			static append(in, chr[0])!;
    253 		case io::EOF =>
    254 			if (len(in) > 0) return wrap_err(invalid);
    255 			if (n > 0) return n;
    256 			return io::EOF;
    257 		};
    258 
    259 		match (t61_chardecode(in)) {
    260 		case let r: rune =>
    261 			let raw = utf8::encoderune(r)!;
    262 			const bufremain = len(buf) - n;
    263 			if (len(raw) < bufremain) {
    264 				buf[n..n + len(raw)] = raw[..];
    265 				n += len(raw);
    266 				in = inbuf[..0];
    267 			} else {
    268 				dataunread(s.d, in);
    269 				break;
    270 			};
    271 		case insufficient =>
    272 			// leave combining char in in
    273 			void;
    274 		case invalid =>
    275 			return wrap_err(invalid);
    276 		};
    277 	};
    278 
    279 	return n;
    280 };
    281 
    282 fn newstrreader(d: *decoder, t: utag) (utf8stream | error) = {
    283 	let strdec: *strdecoder = switch (t) {
    284 	case utag::NUMERIC_STRING =>
    285 		yield &num_decoder;
    286 	case utag::PRINTABLE_STRING =>
    287 		yield &print_decoder;
    288 	case utag::IA5_STRING =>
    289 		yield &ia5_decoder;
    290 	case utag::UTF8_STRING =>
    291 		yield &utf8_decoder;
    292 	case utag::TELETEX_STRING =>
    293 		yield &t61_decoder;
    294 	case utag::BMP_STRING =>
    295 		yield &bmp_decoder;
    296 	case utag::UNIVERSAL_STRING =>
    297 		yield &universal_decoder;
    298 	case =>
    299 		return invalid;
    300 	};
    301 
    302 	return utf8stream {
    303 		stream = &utf8stream_vtable,
    304 		d = d,
    305 		strdec = strdec,
    306 		...
    307 	};
    308 };
    309 
    310 // Returns an [[utf8stream]] for a supported utag 't', which is one of:
    311 //   * utag::NUMERIC_STRING
    312 //   * utag::PRINTABLE_STRING
    313 //   * utag::IA5_STRING
    314 //   * utag::UTF8_STRING
    315 //   * utag::TELETEX_STRING
    316 //   * utag::BMP_STRING
    317 //   * utag::UNIVERSAL_STRING
    318 export fn strreader(d: *decoder, t: utag) (utf8stream | error) = {
    319 	let dh = next(d)?;
    320 	expect_utag(dh, t)?;
    321 	return newstrreader(d, t)!;
    322 };
    323 
    324 // Reads a printable string into 'buf'.
    325 export fn read_printstr(d: *decoder, buf: []u8) (size | error) = {
    326 	let dh = next(d)?;
    327 	expect_utag(dh, utag::PRINTABLE_STRING)?;
    328 
    329 	const n = read_bytes(d, buf)?;
    330 
    331 	for (let i = 0z; i < n; i += 1) {
    332 		if (!c_is_print(buf[i])) {
    333 			return invalid;
    334 		};
    335 	};
    336 	return n;
    337 };
    338 
    339 // Reads an utf8 string into 'buf' and returns a str that borrows from buf.
    340 export fn read_utf8str(d: *decoder, buf: []u8) (str | error) = {
    341 	let dh = next(d)?;
    342 	expect_utag(dh, utag::UTF8_STRING)?;
    343 
    344 	let r = newstrreader(d, utag::UTF8_STRING)!;
    345 	let n = 0z;
    346 
    347 	for (true) {
    348 		n += match (io::read(&r, buf[n..])) {
    349 		case let sz: size =>
    350 			yield sz;
    351 		case io::EOF =>
    352 			break;
    353 		case let e: io::error =>
    354 			return unwrap_err(e);
    355 		};
    356 	};
    357 
    358 	return strings::fromutf8(buf[..n])!;
    359 };