hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

regex.ha (25116B)


      1 // License: MPL-2.0
      2 // (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
      3 use ascii;
      4 use bufio;
      5 use encoding::utf8;
      6 use errors;
      7 use io;
      8 use strconv;
      9 use strings;
     10 
     11 // An error string describing a compilation error.
     12 export type error = !str;
     13 
     14 export type inst_lit = rune,
     15 	inst_charset = struct { idx: size, is_positive: bool },
     16 	inst_any = void,
     17 	inst_split = size,
     18 	inst_jump = size,
     19 	inst_skip = void,
     20 	inst_match = bool,
     21 	inst_groupstart = void,
     22 	inst_groupend = void,
     23 	inst_repeat = struct {
     24 		id: size,
     25 		origin: size,
     26 		min: (void | size),
     27 		max: (void | size),
     28 	};
     29 
     30 export type inst = (inst_lit | inst_any | inst_split | inst_jump |
     31 	inst_skip | inst_match | inst_charset |
     32 	inst_groupstart | inst_groupend |
     33 	inst_repeat);
     34 
     35 // The resulting match of a [[regex]] applied to a string.
     36 //
     37 // The first [[capture]] corresponds to the implicit zeroth capture group,
     38 // i.e. the whole expression.
     39 //
     40 // The rest of the [[capture]]s correspond to the rest of the capture groups,
     41 // i.e. the sub-expressions.
     42 export type result = []capture;
     43 
     44 // A (sub)match corresponding to a regular expression's capture group.
     45 export type capture = struct {
     46 	content: str,
     47 	start: size,
     48 	start_bytesize: size,
     49 	end: size,
     50 	end_bytesize: size
     51 };
     52 
     53 type thread = struct {
     54 	pc: size,
     55 	start_idx: size,
     56 	start_bytesize: size,
     57 	root_capture: capture,
     58 	captures: []capture,
     59 	curr_capture: capture,
     60 	curr_capture_inited: bool,
     61 	rep_counters: []size,
     62 	matched: bool,
     63 	failed: bool,
     64 };
     65 
     66 type newmatch = void;
     67 
     68 export type charclass = enum {
     69 	ALNUM, ALPHA, BLANK, CNTRL, DIGIT, GRAPH, LOWER, PRINT, PUNCT, SPACE,
     70 	UPPER, XDIGIT,
     71 };
     72 export type charset = [](charset_lit_item | charset_range_item |
     73 	charset_class_item),
     74 	charset_lit_item = rune,
     75 	charset_range_item = (u32, u32),
     76 	charset_class_item = *fn(c: rune) bool;
     77 
     78 const charclass_map: [](str, *fn(c: rune) bool) = [
     79 	(":alnum:]", &ascii::isalnum),
     80 	(":alpha:]", &ascii::isalpha),
     81 	(":blank:]", &ascii::isblank),
     82 	(":cntrl:]", &ascii::iscntrl),
     83 	(":digit:]", &ascii::isdigit),
     84 	(":graph:]", &ascii::isgraph),
     85 	(":lower:]", &ascii::islower),
     86 	(":print:]", &ascii::isprint),
     87 	(":punct:]", &ascii::ispunct),
     88 	(":space:]", &ascii::isspace),
     89 	(":upper:]", &ascii::isupper),
     90 	(":xdigit:]", &ascii::isxdigit),
     91 ];
     92 
     93 export type regex = struct {
     94 	insts: []inst,
     95 	charsets: []charset,
     96 	n_reps: size,
     97 };
     98 
     99 // Frees resources associated with a [[regex]].
    100 export fn finish(re: *regex) void = {
    101 	free(re.insts);
    102 	for (let i = 0z; i < len(re.charsets); i += 1) {
    103 		free(re.charsets[i]);
    104 	};
    105 	free(re.charsets);
    106 };
    107 
    108 fn find_last_groupstart(insts: *[]inst) (size | error) = {
    109 	for (let i = len(insts); i > 0; i -= 1) {
    110 		if (insts[i - 1] is inst_groupstart) {
    111 			return i - 1;
    112 		};
    113 	};
    114 	return `Unmatched ')'`: error;
    115 };
    116 
    117 fn handle_bracket(
    118 	insts: *[]inst,
    119 	r: rune,
    120 	r_idx: *size,
    121 	bracket_idx: *int,
    122 	iter: *strings::iterator,
    123 	charsets: *[]charset,
    124 	skip_charclass_rest: *bool,
    125 	is_charset_positive: *bool,
    126 	in_bracket: *bool
    127 ) (void | error) = {
    128 	const peek1 = strings::next(iter);
    129 	const peek2 = strings::next(iter);
    130 	const peek3 = strings::next(iter);
    131 	if (!(peek1 is void)) {
    132 		strings::prev(iter);
    133 	};
    134 	if (!(peek2 is void)) {
    135 		strings::prev(iter);
    136 	};
    137 	if (!(peek3 is void)) {
    138 		strings::prev(iter);
    139 	};
    140 
    141 	if (*bracket_idx == -1) {
    142 		append(charsets, []);
    143 	};
    144 	*bracket_idx += 1;
    145 
    146 	if (*skip_charclass_rest) {
    147 		if (r == ']') {
    148 			*skip_charclass_rest = false;
    149 		};
    150 		*r_idx += 1;
    151 		return;
    152 	};
    153 
    154 	const is_range = peek1 is rune && peek1 as rune == '-'
    155 		&& !(peek2 is void) && !(peek3 is void)
    156 		&& !(peek2 as rune == ']');
    157 	const range_end = peek2;
    158 	const is_first_char = *bracket_idx == 0 || *bracket_idx == 1
    159 		&& !*is_charset_positive;
    160 
    161 	if (r == '\\') {
    162 		if (peek1 is void) {
    163 			return `Trailing backslash '\'`: error;
    164 		} else {
    165 			append(charsets[len(charsets) - 1],
    166 				peek1: charset_lit_item);
    167 			strings::next(iter);
    168 			*r_idx += 1;
    169 		};
    170 	} else if (r == ']' && !is_first_char) {
    171 		const newinst = inst_charset {
    172 			idx = len(charsets) - 1,
    173 			is_positive = *is_charset_positive,
    174 		};
    175 		append(insts, newinst);
    176 		*in_bracket = false;
    177 		*bracket_idx = -1;
    178 		*is_charset_positive = true;
    179 	} else if (r == '^' && *bracket_idx == 0) {
    180 		*is_charset_positive = false;
    181 	} else if (r == '[' && !(peek1 is void)
    182 			&& peek1 as rune == ':') {
    183 		const rest = strings::iterstr(iter);
    184 		const n_cc = len(charclass_map);
    185 		for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) {
    186 			if (strings::hasprefix(rest, charclass_map[cc_idx].0)) {
    187 				append(charsets[len(charsets) - 1],
    188 					charclass_map[cc_idx].1);
    189 				*skip_charclass_rest = true;
    190 				break;
    191 			};
    192 		};
    193 		if (!*skip_charclass_rest) {
    194 			return `No character class after '[:'`: error;
    195 		};
    196 	} else if (is_range) {
    197 		const start_b = r: u32;
    198 		const end_b = range_end as rune: u32;
    199 
    200 		if (end_b < start_b) {
    201 			return `Decending bracket expression range '[z-a]'`: error;
    202 		};
    203 
    204 		append(charsets[len(charsets) - 1],
    205 			(start_b, end_b): charset_range_item);
    206 		strings::next(iter);
    207 		strings::next(iter);
    208 		*r_idx += 2;
    209 	} else {
    210 		append(charsets[len(charsets) - 1],
    211 			r: charset_lit_item);
    212 	};
    213 
    214 	*r_idx += 1;
    215 };
    216 
    217 // Compiles a regular expression string into a [[regex]].
    218 export fn compile(expr: str) (regex | error) = {
    219 	let insts: []inst = [];
    220 	let charsets: []charset = [];
    221 	let iter = strings::iter(expr);
    222 	let r_idx = 0z;
    223 	let anchored = false;
    224 	let curr_alt_jump_idx = -1;
    225 	let in_bracket = false;
    226 	let skip_charclass_rest = false;
    227 	let bracket_idx = -1;
    228 	let is_charset_positive = true;
    229 	let n_reps = 0z;
    230 	let n_groupstarts = 0;
    231 
    232 	for (true) {
    233 		const next = strings::next(&iter);
    234 
    235 		if (r_idx == 0 && next is rune && next: rune != '^') {
    236 			append(insts, void: inst_skip);
    237 		};
    238 
    239 		if (in_bracket) {
    240 			if (next is void) {
    241 				return `Unmatched '['`: error;
    242 			};
    243 			const r = next: rune;
    244 			handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter,
    245 				&charsets, &skip_charclass_rest,
    246 				&is_charset_positive,
    247 				&in_bracket)?;
    248 			continue;
    249 		};
    250 
    251 		const r = match (next) {
    252 		case void =>
    253 			if (n_groupstarts > 0) {
    254 				return `Unmatched '('`: error;
    255 			};
    256 			break;
    257 		case let r: rune => yield r;
    258 		};
    259 		switch (r) {
    260 		case '\\' =>
    261 			const peek1 = strings::next(&iter);
    262 			if (peek1 is void) {
    263 				return `Trailing backslash '\'`: error;
    264 			} else {
    265 				append(insts, (peek1 as rune): inst_lit);
    266 				r_idx += 1;
    267 			};
    268 		case '^' =>
    269 			if (r_idx != 0) {
    270 				return `Anchor '^' not at start`: error;
    271 			};
    272 		case '$' =>
    273 			if (r_idx != len(expr) - 1) {
    274 				return `Anchor '$' not at end`: error;
    275 			};
    276 			anchored = true;
    277 		case '[' =>
    278 			in_bracket = true;
    279 		case ']' =>
    280 			append(insts, r: inst_lit);
    281 		case '(' =>
    282 			if (n_groupstarts > 0) {
    283 				return `Nested capture groups are unsupported`: error;
    284 			};
    285 			append(insts, void: inst_groupstart);
    286 			n_groupstarts += 1;
    287 		case ')' =>
    288 			if (n_groupstarts == 0) {
    289 				return `Unmatched ')'`: error;
    290 			};
    291 			n_groupstarts -= 1;
    292 			append(insts, void: inst_groupend);
    293 			if (curr_alt_jump_idx != -1) {
    294 				assert(insts[curr_alt_jump_idx] is inst_jump);
    295 				insts[curr_alt_jump_idx] =
    296 					(len(insts) - 1): inst_jump;
    297 				curr_alt_jump_idx = -1;
    298 			};
    299 		case '|' =>
    300 			append(insts, 9999999: inst_jump);
    301 			const origin = find_last_groupstart(&insts)? + 1;
    302 			const newinst = (len(insts) + 1): inst_split;
    303 			insert(insts[origin], newinst);
    304 			curr_alt_jump_idx = (len(insts) - 1): int;
    305 		case '{' =>
    306 			let origin = len(insts) - 1;
    307 			if (insts[origin] is inst_groupend) {
    308 				origin = find_last_groupstart(&insts)?;
    309 			};
    310 			const rest = strings::iterstr(&iter);
    311 			const rep_parts = parse_repetition(rest)?;
    312 			const can_skip = rep_parts.0 == 0;
    313 			const min = if (rep_parts.0 == 0) {
    314 				yield 1z;
    315 			} else {
    316 				yield rep_parts.0;
    317 			};
    318 			if (can_skip) {
    319 				insert(insts[origin],
    320 					len(insts) + 2: inst_split);
    321 				origin += 1;
    322 			};
    323 			const newinst = inst_repeat {
    324 				id = n_reps,
    325 				origin = origin,
    326 				min = min,
    327 				max = rep_parts.1,
    328 			};
    329 			for (let i = 0z; i <= rep_parts.2; i += 1) {
    330 				strings::next(&iter);
    331 				r_idx += 1;
    332 			};
    333 			append(insts, newinst);
    334 			n_reps += 1;
    335 		case '?' =>
    336 			if (r_idx == 0 || len(insts) == 0) {
    337 				return `Unused '?'`: error;
    338 			};
    339 			let term_start_idx = len(insts) - 1;
    340 			match (insts[term_start_idx]) {
    341 			case (inst_lit | inst_charset | inst_any) => void;
    342 			case inst_groupend =>
    343 				term_start_idx = find_last_groupstart(&insts)?;
    344 			case inst_groupstart =>
    345 				return `Unused '?'`: error;
    346 			case =>
    347 				return `Misused '?'`: error;
    348 			};
    349 			const after_idx = len(insts) + 1;
    350 			insert(insts[term_start_idx], after_idx: inst_split);
    351 		case '*' =>
    352 			if (r_idx == 0 || len(insts) == 0) {
    353 				return `Unused '*'`: error;
    354 			};
    355 			const new_inst_offset = 1z;
    356 			const jump_idx = len(insts) + new_inst_offset;
    357 			const after_idx = jump_idx + 1z;
    358 			let term_start_idx = len(insts) - 1z;
    359 			match (insts[term_start_idx]) {
    360 			case (inst_lit | inst_charset | inst_any) => void;
    361 			case inst_groupend =>
    362 				term_start_idx = find_last_groupstart(&insts)?;
    363 			case inst_groupstart =>
    364 				return `Unused '*'`: error;
    365 			case =>
    366 				return `Misused '*'`: error;
    367 			};
    368 			const split_idx = term_start_idx;
    369 			term_start_idx += new_inst_offset;
    370 			insert(insts[split_idx], after_idx: inst_split);
    371 			append(insts, split_idx: inst_jump);
    372 		case '+' =>
    373 			if (r_idx == 0 || len(insts) == 0) {
    374 				return `Unused '+'`: error;
    375 			};
    376 			let term_start_idx = len(insts) - 1;
    377 			match (insts[term_start_idx]) {
    378 			case (inst_lit | inst_charset | inst_any) => void;
    379 			case inst_groupend =>
    380 				term_start_idx = find_last_groupstart(&insts)?;
    381 			case inst_groupstart =>
    382 				return `Unused '+'`: error;
    383 			case =>
    384 				return `Misused '+'`: error;
    385 			};
    386 			append(insts, term_start_idx: inst_split);
    387 		case '.' =>
    388 			append(insts, void: inst_any);
    389 		case =>
    390 			append(insts, r: inst_lit);
    391 		};
    392 		r_idx += 1;
    393 	};
    394 
    395 	append(insts, anchored: inst_match);
    396 
    397 	return regex {
    398 		insts = insts,
    399 		charsets = charsets,
    400 		n_reps = n_reps,
    401 	};
    402 };
    403 
    404 fn parse_repetition(
    405 	s: str
    406 ) (((void | size), (void | size), size) | error) = {
    407 	const first_comma = strings::index(s, ",");
    408 	const first_endbrace = strings::index(s, "}");
    409 	if (first_endbrace is void) {
    410 		return `Repetition expression syntax error '{n}'`: error;
    411 	};
    412 	const first_endbrace = first_endbrace as size;
    413 
    414 	let min_str = "";
    415 	let max_str = "";
    416 	let is_single_arg = false;
    417 	if (first_comma is void || first_endbrace < first_comma as size) {
    418 		const cut = strings::cut(s, "}");
    419 		min_str = cut.0;
    420 		max_str = cut.0;
    421 		is_single_arg = true;
    422 	} else {
    423 		const cut = strings::cut(s, ",");
    424 		min_str = cut.0;
    425 		max_str = strings::cut(cut.1, "}").0;
    426 	};
    427 
    428 	let min: (void | size) = void;
    429 	let max: (void | size) = void;
    430 
    431 	if (len(min_str) > 0) {
    432 		min = match (strconv::stoi(min_str)) {
    433 		case let res: int =>
    434 			yield if (res < 0) {
    435 				return `Negative repitition count '{-n}'`: error;
    436 			} else {
    437 				yield res: size;
    438 			};
    439 		case => return `Repetition expression syntax error '{n}'`: error;
    440 		};
    441 	} else {
    442 		min = 0;
    443 	};
    444 
    445 	if (len(max_str) > 0) {
    446 		max = match (strconv::stoi(max_str)) {
    447 		case let res: int =>
    448 			yield if (res < 0) {
    449 				return `Negative repitition count '{-n}'`: error;
    450 			} else {
    451 				yield res: size;
    452 			};
    453 		case => return `Repetition expression syntax error '{n}'`: error;
    454 		};
    455 	};
    456 
    457 	const rep_len = if (is_single_arg) {
    458 		yield len(min_str);
    459 	} else {
    460 		yield len(min_str) + 1 + len(max_str);
    461 	};
    462 	return (min, max, rep_len);
    463 };
    464 
    465 fn delete_thread(i: size, threads: *[]thread) void = {
    466 	free(threads[i].captures);
    467 	free(threads[i].rep_counters);
    468 	delete(threads[i]);
    469 };
    470 
    471 fn is_consuming_inst(a: inst) bool = {
    472 	return a is (inst_lit | inst_any | inst_charset);
    473 };
    474 
    475 fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = {
    476 	// Do not add this thread if there is already another thread with
    477 	// the same PC
    478 	for (let i = 0z; i < len(threads); i += 1) {
    479 		if (threads[i].pc == new_pc
    480 				&& !threads[i].matched
    481 				&& threads[i].start_idx
    482 				< threads[parent_idx].start_idx) {
    483 			return;
    484 		};
    485 	};
    486 
    487 	append(threads, thread {
    488 		pc = new_pc,
    489 		start_idx = threads[parent_idx].start_idx,
    490 		start_bytesize = threads[parent_idx].start_bytesize,
    491 		curr_capture = threads[parent_idx].curr_capture,
    492 		curr_capture_inited = threads[parent_idx].curr_capture_inited,
    493 		matched = threads[parent_idx].matched,
    494 		failed = threads[parent_idx].failed,
    495 		captures = alloc(threads[parent_idx].captures...),
    496 		rep_counters = alloc(threads[parent_idx].rep_counters...),
    497 		...
    498 	});
    499 };
    500 
    501 fn run_thread(
    502 	i: size,
    503 	re: *regex,
    504 	string: str,
    505 	threads: *[]thread,
    506 	r_or_end: (rune | io::EOF),
    507 	str_idx: size,
    508 	str_bytesize: size
    509 ) (void | newmatch) = {
    510 	const str_bytes = strings::toutf8(string);
    511 	if (threads[i].matched) {
    512 		return;
    513 	};
    514 	for (!is_consuming_inst(re.insts[threads[i].pc])) {
    515 		match (re.insts[threads[i].pc]) {
    516 		case inst_lit => abort();
    517 		case inst_any => abort();
    518 		case inst_split =>
    519 			const new_pc = re.insts[threads[i].pc]: inst_split: size;
    520 			add_thread(threads, i, new_pc);
    521 			threads[i].pc += 1;
    522 		case inst_jump =>
    523 			threads[i].pc = re.insts[threads[i].pc]: inst_jump: size;
    524 		case inst_skip =>
    525 			const new_pc = threads[i].pc + 1;
    526 			threads[i].start_idx = str_idx;
    527 			threads[i].start_bytesize = str_bytesize;
    528 			add_thread(threads, i, new_pc);
    529 			break;
    530 		case let anchored: inst_match =>
    531 			// Do not match if we need an end-anchored match, but we
    532 			// have not exhausted our string
    533 			if (anchored && !(r_or_end is io::EOF)) {
    534 				threads[i].failed = true;
    535 				return;
    536 			};
    537 			const content = strings::fromutf8_unsafe(str_bytes[
    538 				threads[i].start_bytesize..str_bytesize]);
    539 			threads[i].root_capture = capture {
    540 				start = threads[i].start_idx,
    541 				start_bytesize = threads[i].start_bytesize,
    542 				end = str_idx,
    543 				end_bytesize = str_bytesize,
    544 				content = content,
    545 			};
    546 			threads[i].matched = true;
    547 			return newmatch;
    548 		case inst_groupstart =>
    549 			assert(!threads[i].curr_capture_inited, "Found nested capture groups in expression, which are not supported");
    550 			threads[i].curr_capture.start = str_idx;
    551 			threads[i].curr_capture.start_bytesize = str_bytesize;
    552 			threads[i].curr_capture_inited = true;
    553 			threads[i].pc += 1;
    554 		case inst_groupend =>
    555 			assert(threads[i].curr_capture_inited, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`);
    556 			threads[i].curr_capture.end = str_idx;
    557 			threads[i].curr_capture.end_bytesize = str_bytesize;
    558 			threads[i].curr_capture.content =
    559 				strings::fromutf8_unsafe(str_bytes[
    560 					threads[i].curr_capture.start_bytesize..
    561 					threads[i].curr_capture.end_bytesize]);
    562 			append(threads[i].captures, threads[i].curr_capture);
    563 			threads[i].curr_capture = capture { ... };
    564 			threads[i].curr_capture_inited = false;
    565 			threads[i].pc += 1;
    566 		case let ir: inst_repeat =>
    567 			assert(ir.id < len(threads[i].rep_counters));
    568 			threads[i].rep_counters[ir.id] += 1;
    569 			if (ir.max is size
    570 					&& threads[i].rep_counters[ir.id]
    571 					> ir.max as size) {
    572 				threads[i].failed = true;
    573 				return;
    574 			};
    575 			const new_pc = threads[i].pc + 1;
    576 			threads[i].pc = ir.origin;
    577 			if (ir.min is void
    578 					|| threads[i].rep_counters[ir.id]
    579 					>= ir.min as size) {
    580 				add_thread(threads, i, new_pc);
    581 			};
    582 		};
    583 	};
    584 
    585 	// From now on, we're only matching consuming instructions, and these
    586 	// can't do anything without another rune.
    587 	if (r_or_end is io::EOF) {
    588 		threads[i].failed = true;
    589 		return;
    590 	};
    591 
    592 	const r = r_or_end as rune;
    593 
    594 	match (re.insts[threads[i].pc]) {
    595 	case inst_skip => return;
    596 	case let lit: inst_lit =>
    597 		if (r != lit) {
    598 			threads[i].failed = true;
    599 		};
    600 	case inst_any => void;
    601 	case let cs: inst_charset =>
    602 		const charset = re.charsets[cs.idx];
    603 		// Disprove the match if we're looking for a negative match
    604 		// Prove the match if we're looking for a positive match
    605 		let matched = !cs.is_positive;
    606 		for (let i = 0z; i < len(charset); i += 1) match (charset[i]) {
    607 		case let lit: charset_lit_item =>
    608 			if (r == lit) {
    609 				// Succeeded if positive match
    610 				// Failed if negative match
    611 				matched = cs.is_positive;
    612 				break;
    613 			};
    614 		case let range: charset_range_item =>
    615 			const r_b = r: u32;
    616 
    617 			if (r_b >= range.0 && r_b <= range.1) {
    618 				// Succeeded if positive match
    619 				// Failed if negative match
    620 				matched = cs.is_positive;
    621 				break;
    622 			};
    623 		case let classfn: charset_class_item =>
    624 			if (classfn(r)) {
    625 				// Succeeded if positive match
    626 				// Failed if negative match
    627 				matched = cs.is_positive;
    628 				break;
    629 			};
    630 		};
    631 		if (!matched) {
    632 			threads[i].failed = true;
    633 		};
    634 	};
    635 
    636 	threads[i].pc += 1;
    637 };
    638 
    639 // Attempts to match a regular expression against a string and returns the
    640 // either the longest leftmost match or all matches.
    641 fn search(
    642 	re: *regex,
    643 	string: str,
    644 	handle: io::handle,
    645 	need_captures: bool
    646 ) (void | []capture) = {
    647 	let threads: []thread = alloc([
    648 		thread { captures = [], ... }
    649 	]);
    650 	if (re.n_reps > 0) {
    651 		threads[0].rep_counters = alloc([0...], re.n_reps);
    652 	};
    653 	defer {
    654 		for (let i = 0z; i < len(threads); i += 1) {
    655 			free(threads[i].captures);
    656 			free(threads[i].rep_counters);
    657 		};
    658 		free(threads);
    659 	};
    660 
    661 	let str_idx = 0z;
    662 	let first_match_idx: (void | size) = void;
    663 	let str_bytesize = 0z;
    664 	let last_bytesize = 0z;
    665 
    666 	for (true) {
    667 		str_bytesize += last_bytesize;
    668 
    669 		if (len(threads) == 0) {
    670 			return void;
    671 		};
    672 
    673 		let all_matched = true;
    674 		for (let i = 0z; i < len(threads); i += 1) {
    675 			if (!threads[i].matched) {
    676 				all_matched = false;
    677 				break;
    678 			};
    679 		};
    680 
    681 		if (all_matched) {
    682 			let best_len = 0z;
    683 			let best_n_captures = 0z;
    684 			let best_idx = 0z;
    685 			for (let i = 0z; i < len(threads); i += 1) {
    686 				let match_len = threads[i].root_capture.end
    687 					- threads[i].root_capture.start;
    688 				const is_better = match_len > best_len
    689 					|| match_len == best_len
    690 					&& len(threads[i].captures)
    691 					> best_n_captures;
    692 				if (is_better) {
    693 					best_len = match_len;
    694 					best_idx = i;
    695 					best_n_captures = len(threads[i].captures);
    696 				};
    697 			};
    698 			let res: []capture = alloc([],
    699 				len(threads[best_idx].captures) + 1);
    700 			append(res, threads[best_idx].root_capture);
    701 			append(res, threads[best_idx].captures...);
    702 			return res;
    703 		};
    704 
    705 		const r_or_end = bufio::scanrune(handle)!;
    706 		if (r_or_end is rune) {
    707 			last_bytesize = utf8::runesz(r_or_end as rune);
    708 		};
    709 
    710 		for (let i = 0z; i < len(threads); i += 1) {
    711 			const res = run_thread(i, re, string, &threads,
    712 				r_or_end, str_idx, str_bytesize);
    713 			const matchlen = threads[i].root_capture.end
    714 				- threads[i].root_capture.start;
    715 			if (res is newmatch && matchlen > 0 && !need_captures) {
    716 				return []: []capture;
    717 			};
    718 			const is_better = res is newmatch && matchlen > 0
    719 				&& (first_match_idx is void
    720 					|| threads[i].start_idx
    721 						< first_match_idx as size);
    722 			if (is_better) {
    723 				first_match_idx = threads[i].start_idx;
    724 			};
    725 		};
    726 		str_idx += 1;
    727 
    728 		// When we only want the leftmost match, delete all threads that
    729 		// start after the earliest non-zero-length matched thread
    730 		if (first_match_idx is size) {
    731 			for (let i = 0z; i < len(threads); i += 1) {
    732 				if (threads[i].start_idx
    733 						> first_match_idx as size) {
    734 					threads[i].failed = true;
    735 				};
    736 			};
    737 		};
    738 
    739 		// Delete threads that have a PC that has already been
    740 		// encountered in previous threads. Prioritise threads that
    741 		// have an earlier start_idx, and threads that were added
    742 		// earlier.
    743 		for (let i = 0i64; i < len(threads): i64 - 1; i += 1) {
    744 			for (let j = i + 1; j < len(threads): i64; j += 1) {
    745 				const same_pc = threads[i].pc == threads[j].pc;
    746 				const none_matched = !threads[j].matched
    747 					&& !threads[i].matched;
    748 				if (same_pc && none_matched) {
    749 					if (threads[i].start_idx
    750 							<= threads[j].start_idx) {
    751 						delete_thread(j: size, &threads);
    752 						j -= 1;
    753 					} else {
    754 						delete_thread(i: size, &threads);
    755 						i -= 1;
    756 						break;
    757 					};
    758 				};
    759 			};
    760 		};
    761 
    762 		for (let i = 0z; i < len(threads); i += 1) {
    763 			if (threads[i].failed) {
    764 				delete_thread(i, &threads);
    765 				i -= 1;
    766 			};
    767 		};
    768 	};
    769 
    770 	return void;
    771 };
    772 
    773 // Returns whether or not a [[regex]] matches any part of a given string.
    774 export fn test(re: *regex, string: str) bool = {
    775 	let strm = bufio::fixed(strings::toutf8(string), io::mode::READ);
    776 	return search(re, string, &strm, false) is []capture;
    777 };
    778 
    779 
    780 // Attempts to match a [[regex]] against a string and returns the longest
    781 // leftmost match as a [[result]]. The caller must free the return value with
    782 // [[result_free]].
    783 export fn find(re: *regex, string: str) result = {
    784 	let strm = bufio::fixed(strings::toutf8(string), io::mode::READ);
    785 	match (search(re, string, &strm, true)) {
    786 	case let m: []capture =>
    787 		return m;
    788 	case void =>
    789 		return [];
    790 	};
    791 };
    792 
    793 // Attempts to match a [[regex]] against a string and returns all
    794 // non-overlapping matches as a slice of [[result]]s. The caller must free the
    795 // return value with [[result_freeall]].
    796 export fn findall(re: *regex, string: str) []result = {
    797 	let res: [][]capture = [];
    798 	let str_idx = 0z, str_bytesize = 0z;
    799 	let substring = string;
    800 	let strm = bufio::fixed(strings::toutf8(string), io::mode::READ);
    801 	const str_bytes = strings::toutf8(string);
    802 	for (true) {
    803 		match (search(re, substring, &strm, true)) {
    804 		case let m: []capture =>
    805 			append(res, m);
    806 			m[0].start += str_idx;
    807 			m[0].end += str_idx;
    808 			m[0].start_bytesize += str_bytesize;
    809 			m[0].end_bytesize += str_bytesize;
    810 			str_idx = m[0].end;
    811 			str_bytesize = m[0].end_bytesize;
    812 			substring = strings::fromutf8(str_bytes[str_bytesize..])!;
    813 			io::seek(&strm, str_bytesize: io::off,
    814 				io::whence::SET)!;
    815 		case void => break;
    816 		};
    817 	};
    818 	return res;
    819 };
    820 
    821 // Replaces all non-overlapping matches of a regular expression against a string
    822 // with 'targetstr'.
    823 //
    824 // A backslash followed by a single decimal number within 'targetstr' is
    825 // replaced by the capture at that index (starting at 1), or an empty string if
    826 // no such capture exists. For example, `\1` is replaced with the first capture,
    827 // `\2` with the second, etc. `\0` is substituted with the entire substring that
    828 // was matched. `\\` is replaced with a literal backslash.
    829 //
    830 // An error is only returned if 'targetstr' isn't formatted correctly.
    831 export fn replace(re: *regex, string: str, targetstr: str) (str | error) = {
    832 	const matches = findall(re, string);
    833 	if (len(matches) == 0) {
    834 		return strings::dup(string);
    835 	};
    836 	defer result_freeall(matches);
    837 
    838 	const target = parse_replace_target(targetstr)?;
    839 	defer free(target);
    840 
    841 	const bytes = strings::toutf8(string);
    842 	let buf = alloc(bytes[..matches[0][0].start_bytesize]...);
    843 
    844 	for (let i = 0z; i < len(matches); i += 1) {
    845 		for (let j = 0z; j < len(target); j += 1) {
    846 			match (target[j]) {
    847 			case let b: []u8 =>
    848 				append(buf, b...);
    849 			case let z: size =>
    850 				if (z >= len(matches[i])) yield;
    851 				const b = strings::toutf8(matches[i][z].content);
    852 				append(buf, b...);
    853 			};
    854 		};
    855 		const start = matches[i][0].end_bytesize;
    856 		const end = if (i == len(matches) - 1) len(bytes)
    857 			else matches[i + 1][0].start_bytesize;
    858 		append(buf, bytes[start..end]...);
    859 	};
    860 
    861 	return strings::fromutf8(buf)!;
    862 };
    863 
    864 fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = {
    865 	const bytes = strings::toutf8(targetstr);
    866 	let target: []([]u8 | size) = alloc([], 1);
    867 	let iter = strings::iter(targetstr);
    868 	let start = 0z, end = 0z;
    869 	for (true) match (strings::next(&iter)) {
    870 	case void =>
    871 		if (start != end) {
    872 			append(target, bytes[start..]);
    873 		};
    874 		break;
    875 	case let r: rune =>
    876 		if (r == '\\') {
    877 			if (start != end) {
    878 				append(target, bytes[start..end]);
    879 			};
    880 
    881 			const r = match (strings::next(&iter)) {
    882 			case void =>
    883 				return "Trailing backslash": error;
    884 			case let r: rune =>
    885 				yield r;
    886 			};
    887 
    888 			if (r == '\\') {
    889 				append(target, '\\');
    890 			} else if (ascii::isdigit(r)) {
    891 				append(target, r: u32: size - 0x30);
    892 			} else {
    893 				return "Backslash must be followed by positive decimal number or a backslash": error;
    894 			};
    895 
    896 			end += 2;
    897 			start = end;
    898 		} else {
    899 			end += utf8::runesz(r);
    900 		};
    901 	};
    902 
    903 	return target;
    904 };
    905 
    906 // Replaces all non-overlapping matches of a regular expression against a string
    907 // with 'targetstr'. 'targetstr' is isn't interpreted in any special way; all
    908 // backslashes are treated literally.
    909 export fn rawreplace(re: *regex, string: str, targetstr: str) str = {
    910 	const matches = findall(re, string);
    911 	if (len(matches) == 0) {
    912 		return strings::dup(string);
    913 	};
    914 	defer result_freeall(matches);
    915 
    916 	const target = strings::toutf8(targetstr);
    917 	const bytes = strings::toutf8(string);
    918 	let buf: []u8 = [];
    919 
    920 	append(buf, bytes[..matches[0][0].start_bytesize]...);
    921 	for (let i = 1z; i < len(matches); i += 1) {
    922 		append(buf, target...);
    923 		const start = matches[i - 1][0].end_bytesize;
    924 		const end = matches[i][0].start_bytesize;
    925 		append(buf, bytes[start..end]...);
    926 	};
    927 	append(buf, target...);
    928 	append(buf, bytes[matches[len(matches) - 1][0].end_bytesize..]...);
    929 
    930 	return strings::fromutf8(buf)!;
    931 };
    932 
    933 // Frees a [[result]].
    934 export fn result_free(s: result) void = {
    935 	free(s);
    936 };
    937 
    938 // Frees a slice of [[result]]s.
    939 export fn result_freeall(s: []result) void = {
    940 	for (let i = 0z; i < len(s); i += 1) {
    941 		result_free(s[i]);
    942 	};
    943 	free(s);
    944 };
    945 
    946 // Converts an [[error]] into a user-friendly string.
    947 export fn strerror(err: error) str = err;