hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

regex.ha (22437B)


      1 // License: MPL-2.0
      2 // (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
      3 use ascii;
      4 use encoding::utf8;
      5 use errors;
      6 use strconv;
      7 use strings;
      8 
      9 // A string describing the error the occurred.
     10 export type error = !str;
     11 
     12 export type inst_lit = rune,
     13 	inst_charset = struct { idx: size, is_positive: bool },
     14 	inst_any = void,
     15 	inst_split = size,
     16 	inst_jump = size,
     17 	inst_skip = void,
     18 	inst_match = bool,
     19 	inst_groupstart = void,
     20 	inst_groupend = void,
     21 	inst_repeat = struct {
     22 		id: size,
     23 		origin: size,
     24 		min: (void | size),
     25 		max: (void | size),
     26 	};
     27 
     28 export type inst = (inst_lit | inst_any | inst_split | inst_jump |
     29 	inst_skip | inst_match | inst_charset |
     30 	inst_groupstart | inst_groupend |
     31 	inst_repeat);
     32 
     33 // A (sub)match found as a result of matching a certain string against a regex.
     34 export type capture = struct {
     35 	content: str,
     36 	start: size,
     37 	start_bytesize: size,
     38 	end: size,
     39 	end_bytesize: size
     40 };
     41 
     42 type thread = struct {
     43 	pc: size,
     44 	start_idx: size,
     45 	start_bytesize: size,
     46 	root_capture: capture,
     47 	captures: []capture,
     48 	curr_capture: capture,
     49 	curr_capture_inited: bool,
     50 	rep_counters: []size,
     51 	matched: bool,
     52 	failed: bool,
     53 };
     54 
     55 type newmatch = void;
     56 
     57 export type charclass = enum {
     58 	ALNUM, ALPHA, BLANK, CNTRL, DIGIT, GRAPH, LOWER, PRINT, PUNCT, SPACE,
     59 	UPPER, XDIGIT,
     60 };
     61 export type charset = [](charset_lit_item | charset_range_item |
     62 	charset_class_item),
     63 	charset_lit_item = rune,
     64 	charset_range_item = (u8, u8),
     65 	charset_class_item = *fn(c: rune) bool;
     66 
     67 const charclass_map: [](str, *fn(c: rune) bool) = [
     68 	(":alnum:]", &ascii::isalnum),
     69 	(":alpha:]", &ascii::isalpha),
     70 	(":blank:]", &ascii::isblank),
     71 	(":cntrl:]", &ascii::iscntrl),
     72 	(":digit:]", &ascii::isdigit),
     73 	(":graph:]", &ascii::isgraph),
     74 	(":lower:]", &ascii::islower),
     75 	(":print:]", &ascii::isprint),
     76 	(":punct:]", &ascii::ispunct),
     77 	(":space:]", &ascii::isspace),
     78 	(":upper:]", &ascii::isupper),
     79 	(":xdigit:]", &ascii::isxdigit),
     80 ];
     81 
     82 export type regex = struct {
     83 	insts: []inst,
     84 	charsets: []charset,
     85 	n_reps: size,
     86 };
     87 
     88 // Frees the memory used by a regex.
     89 export fn finish(re: *regex) void = {
     90 	free(re.insts);
     91 	for (let i = 0z; i < len(re.charsets); i += 1) {
     92 		free(re.charsets[i]);
     93 	};
     94 	free(re.charsets);
     95 };
     96 
     97 fn find_last_groupstart(insts: *[]inst) (size | error) = {
     98 	for (let i = len(insts); i > 0; i -= 1) {
     99 		if (insts[i - 1] is inst_groupstart) {
    100 			return i - 1;
    101 		};
    102 	};
    103 	return `Encountered ")" token without matching "("`: error;
    104 };
    105 
    106 fn handle_bracket(
    107 	insts: *[]inst,
    108 	r: rune,
    109 	r_idx: *size,
    110 	bracket_idx: *int,
    111 	iter: *strings::iterator,
    112 	charsets: *[]charset,
    113 	skip_charclass_rest: *bool,
    114 	is_charset_positive: *bool,
    115 	in_bracket: *bool
    116 ) (void | error) = {
    117 	const peek1 = strings::next(iter);
    118 	const peek2 = strings::next(iter);
    119 	const peek3 = strings::next(iter);
    120 	if (!(peek1 is void)) {
    121 		strings::prev(iter);
    122 	};
    123 	if (!(peek2 is void)) {
    124 		strings::prev(iter);
    125 	};
    126 	if (!(peek3 is void)) {
    127 		strings::prev(iter);
    128 	};
    129 
    130 	if (*bracket_idx == -1) {
    131 		append(charsets, []);
    132 	};
    133 	*bracket_idx += 1;
    134 
    135 	if (*skip_charclass_rest) {
    136 		if (r == ']') {
    137 			*skip_charclass_rest = false;
    138 		};
    139 		*r_idx += 1;
    140 		return;
    141 	};
    142 
    143 	const is_range = peek1 is rune && peek1 as rune == '-'
    144 		&& !(peek2 is void) && !(peek3 is void)
    145 		&& !(peek2 as rune == ']');
    146 	const range_end = peek2;
    147 	const is_first_char = *bracket_idx == 0 || *bracket_idx == 1
    148 		&& !*is_charset_positive;
    149 	if (r == ']' && !is_first_char) {
    150 		const newinst = inst_charset {
    151 			idx = len(charsets) - 1,
    152 			is_positive = *is_charset_positive,
    153 		};
    154 		append(insts, newinst);
    155 		*in_bracket = false;
    156 		*bracket_idx = -1;
    157 		*is_charset_positive = true;
    158 	} else if (r == '^' && *bracket_idx == 0) {
    159 		*is_charset_positive = false;
    160 	} else if (r == '[' && !(peek1 is void)
    161 			&& peek1 as rune == ':') {
    162 		const rest = strings::iterstr(iter);
    163 		const n_cc = len(charclass_map);
    164 		for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) {
    165 			if (strings::hasprefix(rest, charclass_map[cc_idx].0)) {
    166 				append(charsets[len(charsets) - 1],
    167 					charclass_map[cc_idx].1);
    168 				*skip_charclass_rest = true;
    169 				break;
    170 			};
    171 		};
    172 		if (!*skip_charclass_rest) {
    173 			return `Found "[:" in bracket expression and expected a character class such as [:alpha:], but none was found. If you did not mean to use a charclass, try ":["`: error;
    174 		};
    175 	} else if (is_range) {
    176 		const start_enc = utf8::encoderune(r);
    177 		assert(len(start_enc) == 1, "Character ranges do not currently support characters larger than one byte");
    178 		const start_b = start_enc[0];
    179 
    180 		const end_enc = utf8::encoderune(range_end as rune);
    181 		assert(len(end_enc) == 1, "Character ranges do not currently support characters larger than one byte");
    182 		const end_b = end_enc[0];
    183 
    184 		if (end_b < start_b) {
    185 			return `Found range in bracket expression where end character was before start character, e.g. "[b-a]"`: error;
    186 		};
    187 
    188 		append(charsets[len(charsets) - 1],
    189 			(start_b, end_b): charset_range_item);
    190 		strings::next(iter);
    191 		strings::next(iter);
    192 		*r_idx += 2;
    193 	} else {
    194 		append(charsets[len(charsets) - 1],
    195 			r: charset_lit_item);
    196 	};
    197 
    198 	*r_idx += 1;
    199 };
    200 
    201 // Compiles a string containing a regular expression into a regex struct.
    202 export fn compile(expr: str) (regex | error) = {
    203 	let insts: []inst = alloc([]);
    204 	let charsets: []charset = alloc([]);
    205 	let iter = strings::iter(expr);
    206 	let r_idx = 0z;
    207 	let anchored = false;
    208 	let curr_alt_jump_idx = -1;
    209 	let in_bracket = false;
    210 	let skip_charclass_rest = false;
    211 	let bracket_idx = -1;
    212 	let is_charset_positive = true;
    213 	let n_reps = 0z;
    214 	let n_groupstarts = 0;
    215 
    216 	for (true) {
    217 		const next = strings::next(&iter);
    218 
    219 		if (r_idx == 0 && next is rune && next: rune != '^') {
    220 				append(insts, void: inst_skip);
    221 		};
    222 
    223 		if (in_bracket) {
    224 			if (next is void) {
    225 				return `Found unterminated bracket expression, are you missing a closing "]"?`: error;
    226 			};
    227 			const r = next: rune;
    228 			handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter,
    229 				&charsets, &skip_charclass_rest,
    230 				&is_charset_positive,
    231 				&in_bracket)?;
    232 			continue;
    233 		};
    234 
    235 		const r = match (next) {
    236 		case void =>
    237 			if (n_groupstarts > 0) {
    238 				return "Expression ended, but there were still unclosed groups": error;
    239 			};
    240 			break;
    241 		case let r: rune => yield r;
    242 		};
    243 		switch (r) {
    244 		case '\\' =>
    245 			const peek1 = strings::next(&iter);
    246 			if (peek1 is void) {
    247 				return "Found an escaping backslash, but there was nothing to escape": error;
    248 			} else {
    249 				append(insts, (peek1 as rune): inst_lit);
    250 				r_idx += 1;
    251 			};
    252 		case '^' =>
    253 			if (r_idx != 0) {
    254 				return `Anchor character "^" may only occur at the start of the expression`: error;
    255 			};
    256 		case '$' =>
    257 			if (r_idx != len(expr) - 1) {
    258 				return `Anchor character "$" may only occur at the end of the expression`: error;
    259 			};
    260 			anchored = true;
    261 		case '[' =>
    262 			in_bracket = true;
    263 		case ']' =>
    264 			if (in_bracket) {
    265 				in_bracket = false;
    266 			} else {
    267 				append(insts, r: inst_lit);
    268 			};
    269 		case '(' =>
    270 			if (n_groupstarts > 0) {
    271 				return "Found nested capture groups in expression, which are not supported": error;
    272 			};
    273 			append(insts, void: inst_groupstart);
    274 			n_groupstarts += 1;
    275 		case ')' =>
    276 			if (n_groupstarts == 0) {
    277 				return "Tried to close group but none was open": error;
    278 			};
    279 			n_groupstarts -= 1;
    280 			append(insts, void: inst_groupend);
    281 			if (curr_alt_jump_idx != -1) {
    282 				assert(insts[curr_alt_jump_idx] is inst_jump);
    283 				insts[curr_alt_jump_idx] =
    284 					(len(insts) - 1): inst_jump;
    285 				curr_alt_jump_idx = -1;
    286 			};
    287 		case '|' =>
    288 			append(insts, 9999999: inst_jump);
    289 			const origin = find_last_groupstart(&insts)? + 1;
    290 			const newinst = (len(insts) + 1): inst_split;
    291 			insert(insts[origin], newinst);
    292 			curr_alt_jump_idx = (len(insts) - 1): int;
    293 		case '{' =>
    294 			let origin = len(insts) - 1;
    295 			if (insts[origin] is inst_groupend) {
    296 				origin = find_last_groupstart(&insts)?;
    297 			};
    298 			const rest = strings::iterstr(&iter);
    299 			const rep_parts = parse_repetition(rest)?;
    300 			const can_skip = rep_parts.0 == 0;
    301 			const min = if (rep_parts.0 == 0) {
    302 				yield 1z;
    303 			} else {
    304 				yield rep_parts.0;
    305 			};
    306 			if (can_skip) {
    307 				insert(insts[origin],
    308 					len(insts) + 2: inst_split);
    309 				origin += 1;
    310 			};
    311 			const newinst = inst_repeat {
    312 				id = n_reps,
    313 				origin = origin,
    314 				min = min,
    315 				max = rep_parts.1,
    316 			};
    317 			for (let i = 0z; i <= rep_parts.2; i += 1) {
    318 				strings::next(&iter);
    319 				r_idx += 1;
    320 			};
    321 			append(insts, newinst);
    322 			n_reps += 1;
    323 		case '?' =>
    324 			if (r_idx == 0 || len(insts) == 0) {
    325 				return `Found "?" but there was nothing before it`: error;
    326 			};
    327 			let term_start_idx = len(insts) - 1;
    328 			match (insts[term_start_idx]) {
    329 			case (inst_lit | inst_charset | inst_any) => void;
    330 			case inst_groupend =>
    331 				term_start_idx = find_last_groupstart(&insts)?;
    332 			case inst_groupstart =>
    333 				return `Found "?" but it was in an empty group`: error;
    334 			case =>
    335 				return `Invalid use of "?"`: error;
    336 			};
    337 			const after_idx = len(insts) + 1;
    338 			insert(insts[term_start_idx], after_idx: inst_split);
    339 		case '*' =>
    340 			if (r_idx == 0 || len(insts) == 0) {
    341 				return `Found "*" but there was nothing before it`: error;
    342 			};
    343 			const new_inst_offset = 1z;
    344 			const jump_idx = len(insts) + new_inst_offset;
    345 			const after_idx = jump_idx + 1z;
    346 			let term_start_idx = len(insts) - 1z;
    347 			match (insts[term_start_idx]) {
    348 			case (inst_lit | inst_charset | inst_any) => void;
    349 			case inst_groupend =>
    350 				term_start_idx = find_last_groupstart(&insts)?;
    351 			case inst_groupstart =>
    352 				return `Found "*" but it was in an empty group`: error;
    353 			case =>
    354 				return `Invalid use of "*"`: error;
    355 			};
    356 			const split_idx = term_start_idx;
    357 			term_start_idx += new_inst_offset;
    358 			insert(insts[split_idx], after_idx: inst_split);
    359 			append(insts, split_idx: inst_jump);
    360 		case '+' =>
    361 			if (r_idx == 0 || len(insts) == 0) {
    362 				return `Found "+" but there was nothing before it`: error;
    363 			};
    364 			let term_start_idx = len(insts) - 1;
    365 			match (insts[term_start_idx]) {
    366 			case (inst_lit | inst_charset | inst_any) => void;
    367 			case inst_groupend =>
    368 				term_start_idx = find_last_groupstart(&insts)?;
    369 			case inst_groupstart =>
    370 				return `Found "+" but it was in an empty group`: error;
    371 			case =>
    372 				return `Invalid use of "+"`: error;
    373 			};
    374 			append(insts, term_start_idx: inst_split);
    375 		case '.' =>
    376 			append(insts, void: inst_any);
    377 		case =>
    378 			append(insts, r: inst_lit);
    379 		};
    380 		r_idx += 1;
    381 	};
    382 
    383 	append(insts, anchored: inst_match);
    384 
    385 	return regex {
    386 		insts = insts,
    387 		charsets = charsets,
    388 		n_reps = n_reps,
    389 	};
    390 };
    391 
    392 fn parse_repetition(
    393 	s: str
    394 ) (((void | size), (void | size), size) | error) = {
    395 	const first_comma = strings::index(s, ",");
    396 	const first_endbrace = strings::index(s, "}");
    397 	if (first_endbrace is void) {
    398 		return "Invalid repetition value": error;
    399 	};
    400 	const first_endbrace = first_endbrace as size;
    401 
    402 	let min_str = "";
    403 	let max_str = "";
    404 	let is_single_arg = false;
    405 	if (first_comma is void || first_endbrace < first_comma as size) {
    406 		const cut = strings::cut(s, "}");
    407 		min_str = cut.0;
    408 		max_str = cut.0;
    409 		is_single_arg = true;
    410 	} else {
    411 		const cut = strings::cut(s, ",");
    412 		min_str = cut.0;
    413 		max_str = strings::cut(cut.1, "}").0;
    414 	};
    415 
    416 	let min: (void | size) = void;
    417 	let max: (void | size) = void;
    418 
    419 	if (len(min_str) > 0) {
    420 		min = match (strconv::stoi(min_str)) {
    421 		case let res: int =>
    422 			yield if (res < 0) {
    423 				return `Only positive integers are allowed inside "{}"`: error;
    424 			} else {
    425 				yield res: size;
    426 			};
    427 		case => return "Invalid repetition minimum value": error;
    428 		};
    429 	} else {
    430 		min = 0;
    431 	};
    432 
    433 	if (len(max_str) > 0) {
    434 		max = match (strconv::stoi(max_str)) {
    435 		case let res: int =>
    436 			yield if (res < 0) {
    437 				return `Only positive integers are allowed inside "{}"`: error;
    438 			} else {
    439 				yield res: size;
    440 			};
    441 		case => return "Invalid repetition maximum value": error;
    442 		};
    443 	};
    444 
    445 	const rep_len = if (is_single_arg) {
    446 		yield len(min_str);
    447 	} else {
    448 		yield len(min_str) + 1 + len(max_str);
    449 	};
    450 	return (min, max, rep_len);
    451 };
    452 
    453 fn delete_thread(i: size, threads: *[]thread) void = {
    454 	free(threads[i].captures);
    455 	free(threads[i].rep_counters);
    456 	delete(threads[i]);
    457 };
    458 
    459 fn is_consuming_inst(a: inst) bool = {
    460 	return a is (inst_lit | inst_any | inst_charset);
    461 };
    462 
    463 fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = {
    464 	// Do not add this thread if there is already another thread with
    465 	// the same PC
    466 	for (let i = 0z; i < len(threads); i += 1) {
    467 		if (threads[i].pc == new_pc
    468 				&& !threads[i].matched
    469 				&& threads[i].start_idx
    470 				< threads[parent_idx].start_idx) {
    471 			return;
    472 		};
    473 	};
    474 
    475 	append(threads, thread {
    476 		pc = new_pc,
    477 		start_idx = threads[parent_idx].start_idx,
    478 		start_bytesize = threads[parent_idx].start_bytesize,
    479 		curr_capture = threads[parent_idx].curr_capture,
    480 		curr_capture_inited = threads[parent_idx].curr_capture_inited,
    481 		matched = threads[parent_idx].matched,
    482 		failed = threads[parent_idx].failed,
    483 		captures = alloc(threads[parent_idx].captures...),
    484 		rep_counters = alloc(threads[parent_idx].rep_counters...),
    485 		...
    486 	});
    487 };
    488 
    489 fn run_thread(
    490 	i: size,
    491 	re: *regex,
    492 	string: str,
    493 	threads: *[]thread,
    494 	r_or_end: (rune | void),
    495 	str_idx: int,
    496 	str_bytesize: size
    497 ) (void | newmatch) = {
    498 	const str_bytes = strings::toutf8(string);
    499 	if (threads[i].matched) {
    500 		return;
    501 	};
    502 	for (!is_consuming_inst(re.insts[threads[i].pc])) {
    503 		match (re.insts[threads[i].pc]) {
    504 		case inst_lit => abort();
    505 		case inst_any => abort();
    506 		case inst_split =>
    507 			const new_pc = re.insts[threads[i].pc]: inst_split: size;
    508 			add_thread(threads, i, new_pc);
    509 			threads[i].pc += 1;
    510 		case inst_jump =>
    511 			threads[i].pc = re.insts[threads[i].pc]: inst_jump: size;
    512 		case inst_skip =>
    513 			const new_pc = threads[i].pc + 1;
    514 			threads[i].start_idx = str_idx: size;
    515 			threads[i].start_bytesize = str_bytesize;
    516 			add_thread(threads, i, new_pc);
    517 			break;
    518 		case let anchored: inst_match =>
    519 			// Do not match if we need an end-anchored match, but we
    520 			// have not exhausted our string
    521 			if (anchored && !(r_or_end is void)) {
    522 				threads[i].failed = true;
    523 				return;
    524 			};
    525 			const content = strings::fromutf8_unsafe(str_bytes[
    526 				threads[i].start_bytesize..str_bytesize]);
    527 			threads[i].root_capture = capture {
    528 				start = threads[i].start_idx,
    529 				start_bytesize = threads[i].start_bytesize,
    530 				end = str_idx: size,
    531 				end_bytesize = str_bytesize,
    532 				content = content,
    533 			};
    534 			threads[i].matched = true;
    535 			return newmatch;
    536 		case inst_groupstart =>
    537 			assert(!threads[i].curr_capture_inited, "Found nested capture groups in expression, which are not supported");
    538 			threads[i].curr_capture.start = str_idx: size;
    539 			threads[i].curr_capture.start_bytesize = str_bytesize;
    540 			threads[i].curr_capture_inited = true;
    541 			threads[i].pc += 1;
    542 		case inst_groupend =>
    543 			assert(threads[i].curr_capture_inited, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`);
    544 			threads[i].curr_capture.end = str_idx: size;
    545 			threads[i].curr_capture.end_bytesize = str_bytesize;
    546 			threads[i].curr_capture.content =
    547 				strings::fromutf8_unsafe(str_bytes[
    548 					threads[i].curr_capture.start_bytesize..
    549 					threads[i].curr_capture.end_bytesize]);
    550 			append(threads[i].captures, threads[i].curr_capture);
    551 			threads[i].curr_capture = capture { ... };
    552 			threads[i].curr_capture_inited = false;
    553 			threads[i].pc += 1;
    554 		case let ir: inst_repeat =>
    555 			assert(ir.id < len(threads[i].rep_counters));
    556 			threads[i].rep_counters[ir.id] += 1;
    557 			if (ir.max is size
    558 					&& threads[i].rep_counters[ir.id]
    559 					> ir.max as size) {
    560 				threads[i].failed = true;
    561 				return;
    562 			};
    563 			const new_pc = threads[i].pc + 1;
    564 			threads[i].pc = ir.origin;
    565 			if (ir.min is void
    566 					|| threads[i].rep_counters[ir.id]
    567 					>= ir.min as size) {
    568 				add_thread(threads, i, new_pc);
    569 			};
    570 		};
    571 	};
    572 
    573 	// From now on, we're only matching consuming instructions, and these
    574 	// can't do anything without another rune.
    575 	if (r_or_end is void) {
    576 		threads[i].failed = true;
    577 		return;
    578 	};
    579 
    580 	const r = r_or_end as rune;
    581 
    582 	match (re.insts[threads[i].pc]) {
    583 	case inst_skip => return;
    584 	case let lit: inst_lit =>
    585 		if (r != lit) {
    586 			threads[i].failed = true;
    587 		};
    588 	case inst_any => void;
    589 	case let cs: inst_charset =>
    590 		const charset = re.charsets[cs.idx];
    591 		// Disprove the match if we're looking for a negative match
    592 		// Prove the match if we're looking for a positive match
    593 		let matched = !cs.is_positive;
    594 		for (let i = 0z; i < len(charset); i += 1) match (charset[i]) {
    595 		case let lit: charset_lit_item =>
    596 			if (r == lit) {
    597 				// Succeeded if positive match
    598 				// Failed if negative match
    599 				matched = cs.is_positive;
    600 				break;
    601 			};
    602 		case let range: charset_range_item =>
    603 			const r_enc = utf8::encoderune(r);
    604 			assert(len(r_enc) == 1, "Character ranges do not currently support characters larger than one byte");
    605 			const r_b = r_enc[0];
    606 			if (r_b >= range.0 && r_b <= range.1) {
    607 				// Succeeded if positive match
    608 				// Failed if negative match
    609 				matched = cs.is_positive;
    610 				break;
    611 			};
    612 		case let classfn: charset_class_item =>
    613 			if (classfn(r)) {
    614 				// Succeeded if positive match
    615 				// Failed if negative match
    616 				matched = cs.is_positive;
    617 				break;
    618 			};
    619 		};
    620 		if (!matched) {
    621 			threads[i].failed = true;
    622 		};
    623 	};
    624 
    625 	threads[i].pc += 1;
    626 };
    627 
    628 // Attempts to match a regular expression against a string and returns the
    629 // either the longest leftmost match or all matches.
    630 fn search(
    631 	re: *regex,
    632 	string: str,
    633 	str_iter: *strings::iterator,
    634 	str_idx: *int,
    635 	str_bytesize: *size,
    636 	need_captures: bool
    637 ) (void | []capture) = {
    638 	let threads: []thread = alloc([
    639 		thread { captures = alloc([]), ... }
    640 	]);
    641 	if (re.n_reps > 0) {
    642 		threads[0].rep_counters = alloc([0...], re.n_reps);
    643 	};
    644 	defer {
    645 		for (let i = 0z; i < len(threads); i += 1) {
    646 			free(threads[i].captures);
    647 			free(threads[i].rep_counters);
    648 		};
    649 		free(threads);
    650 	};
    651 
    652 	let first_match_idx: (void | size) = void;
    653 	let last_bytesize = 0z;
    654 
    655 	for (true) {
    656 		*str_bytesize += last_bytesize;
    657 
    658 		if (len(threads) == 0) {
    659 			return void;
    660 		};
    661 
    662 		let all_matched = true;
    663 		for (let i = 0z; i < len(threads); i += 1) {
    664 			if (!threads[i].matched) {
    665 				all_matched = false;
    666 				break;
    667 			};
    668 		};
    669 
    670 		if (all_matched) {
    671 			let best_len = 0z;
    672 			let best_n_captures = 0z;
    673 			let best_idx = 0z;
    674 			for (let i = 0z; i < len(threads); i += 1) {
    675 				let match_len = threads[i].root_capture.end
    676 					- threads[i].root_capture.start;
    677 				const is_better = match_len > best_len
    678 					|| match_len == best_len
    679 					&& len(threads[i].captures)
    680 					> best_n_captures;
    681 				if (is_better) {
    682 					best_len = match_len;
    683 					best_idx = i;
    684 					best_n_captures = len(threads[i].captures);
    685 				};
    686 			};
    687 			let res: []capture = alloc([],
    688 				len(threads[best_idx].captures) + 1);
    689 			append(res, threads[best_idx].root_capture);
    690 			append(res, threads[best_idx].captures...);
    691 			return res;
    692 		};
    693 
    694 		const r_or_end = strings::next(str_iter);
    695 		*str_idx += 1;
    696 		if (r_or_end is rune) {
    697 			last_bytesize = utf8::runesz(r_or_end as rune);
    698 		};
    699 
    700 		for (let i = 0z; i < len(threads); i += 1) {
    701 			const res = run_thread(i, re, string, &threads,
    702 				r_or_end, *str_idx, *str_bytesize);
    703 			const matchlen = threads[i].root_capture.end
    704 				- threads[i].root_capture.start;
    705 			if (res is newmatch && matchlen > 0 && !need_captures) {
    706 				return []: []capture;
    707 			};
    708 			const is_better = res is newmatch && matchlen > 0
    709 				&& (first_match_idx is void
    710 					|| threads[i].start_idx
    711 						< first_match_idx as size);
    712 			if (is_better) {
    713 				first_match_idx = threads[i].start_idx;
    714 			};
    715 		};
    716 
    717 		// When we only want the leftmost match, delete all threads that
    718 		// start after the earliest non-zero-length matched thread
    719 		if (first_match_idx is size) {
    720 			for (let i = 0z; i < len(threads); i += 1) {
    721 				if (threads[i].start_idx
    722 						> first_match_idx as size) {
    723 					threads[i].failed = true;
    724 				};
    725 			};
    726 		};
    727 
    728 		// Delete threads that have a PC that has already been
    729 		// encountered in previous threads. Prioritise threads that
    730 		// have an earlier start_idx, and threads that were added
    731 		// earlier.
    732 		for (let i = 0i64; i < len(threads): i64 - 1; i += 1) {
    733 			for (let j = i + 1; j < len(threads): i64; j += 1) {
    734 				const same_pc = threads[i].pc == threads[j].pc;
    735 				const none_matched = !threads[j].matched
    736 					&& !threads[i].matched;
    737 				if (same_pc && none_matched) {
    738 					if (threads[i].start_idx
    739 							<= threads[j].start_idx) {
    740 						delete_thread(j: size, &threads);
    741 						j -= 1;
    742 					} else {
    743 						delete_thread(i: size, &threads);
    744 						i -= 1;
    745 						break;
    746 					};
    747 				};
    748 			};
    749 		};
    750 
    751 		for (let i = 0z; i < len(threads); i += 1) {
    752 			if (threads[i].failed) {
    753 				delete_thread(i, &threads);
    754 				i -= 1;
    755 			};
    756 		};
    757 	};
    758 
    759 	return void;
    760 };
    761 
    762 // Returns whether or not a regex matches a string.
    763 export fn test(re: *regex, string: str) bool = {
    764 	let str_idx = -1;
    765 	let str_iter = strings::iter(string);
    766 	let str_bytesize = 0z;
    767 	match (search(re, string, &str_iter, &str_idx, &str_bytesize, false)) {
    768 	case void => return false;
    769 	case []capture => return true;
    770 	};
    771 };
    772 
    773 
    774 // Attempts to match a regular expression against a string and returns the
    775 // longest leftmost match, or void if there is no match.
    776 export fn find(re: *regex, string: str) (void | []capture) = {
    777 	let str_idx = -1;
    778 	let str_iter = strings::iter(string);
    779 	let str_bytesize = 0z;
    780 	return search(re, string, &str_iter, &str_idx, &str_bytesize, true);
    781 };
    782 
    783 // Attempts to match a regular expression against a string and returns all
    784 // non-overlapping matches, or void if there are no matches.
    785 export fn findall(re: *regex, string: str) (void | [][]capture) = {
    786 	let res: [][]capture = alloc([]);
    787 	let str_idx = -1;
    788 	let str_iter = strings::iter(string);
    789 	let str_bytesize = 0z;
    790 	for (true) {
    791 		const findres = search(re, string, &str_iter, &str_idx,
    792 			&str_bytesize, true);
    793 		match (findres) {
    794 		case let m: []capture =>
    795 			append(res, m);
    796 			assert(str_idx: size >= m[0].end);
    797 			for (str_idx: size > m[0].end) {
    798 				strings::prev(&str_iter);
    799 				str_idx -= 1;
    800 			};
    801 			if (str_idx: size >= len(string)) {
    802 				break;
    803 			};
    804 		case void => break;
    805 		};
    806 	};
    807 	if (len(res) == 0) {
    808 		return void;
    809 	};
    810 	return res;
    811 };
    812 
    813 // Frees a slice of captures.
    814 export fn free_captures(s: []capture) void = {
    815 	free(s);
    816 };
    817 
    818 // Frees each match in a slice of matches, as well as the slice itself.
    819 export fn free_matches(s: [][]capture) void = {
    820 	for (let i = 0z; i < len(s); i += 1) {
    821 		free_captures(s[i]);
    822 	};
    823 	free(s);
    824 };
    825 
    826 // Converts regex [[error]] into a user-friendly string.
    827 export fn strerror(err: error) str = err;