hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

regex.ha (26616B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use ascii;
      5 use bufio;
      6 use encoding::utf8;
      7 use io;
      8 use memio;
      9 use strconv;
     10 use strings;
     11 use types;
     12 
     13 // An error string describing a compilation error.
     14 export type error = !str;
     15 
     16 export type inst_lit = rune,
     17 	inst_charset = struct { idx: size, is_positive: bool },
     18 	inst_any = void,
     19 	inst_split = size,
     20 	inst_jump = size,
     21 	inst_skip = void,
     22 	inst_match = bool,
     23 	inst_groupstart = void,
     24 	inst_groupend = void,
     25 	inst_repeat = struct {
     26 		id: size,
     27 		origin: size,
     28 		min: (void | size),
     29 		max: (void | size),
     30 	};
     31 
     32 export type inst = (inst_lit | inst_any | inst_split | inst_jump |
     33 	inst_skip | inst_match | inst_charset |
     34 	inst_groupstart | inst_groupend |
     35 	inst_repeat);
     36 
     37 // The resulting match of a [[regex]] applied to a string.
     38 //
     39 // The first [[capture]] corresponds to the implicit zeroth capture group,
     40 // i.e. the whole expression.
     41 //
     42 // The rest of the [[capture]]s correspond to the rest of the capture groups,
     43 // i.e. the sub-expressions.
     44 export type result = []capture;
     45 
     46 // A (sub)match corresponding to a regular expression's capture group.
     47 export type capture = struct {
     48 	content: str,
     49 	start: size,
     50 	start_bytesize: size,
     51 	end: size,
     52 	end_bytesize: size
     53 };
     54 
     55 type thread = struct {
     56 	pc: size,
     57 	start_idx: size,
     58 	start_bytesize: size,
     59 	root_capture: capture,
     60 	captures: []capture,
     61 	curr_capture: capture,
     62 	curr_capture_inited: bool,
     63 	rep_counters: []size,
     64 	matched: bool,
     65 	failed: bool,
     66 };
     67 
     68 type newmatch = void;
     69 
     70 export type charclass = enum {
     71 	ALNUM, ALPHA, BLANK, CNTRL, DIGIT, GRAPH, LOWER, PRINT, PUNCT, SPACE,
     72 	UPPER, XDIGIT,
     73 };
     74 export type charset = [](charset_lit_item | charset_range_item |
     75 	charset_class_item),
     76 	charset_lit_item = rune,
     77 	charset_range_item = (u32, u32),
     78 	charset_class_item = *fn(c: rune) bool;
     79 
     80 const charclass_map: [](str, *fn(c: rune) bool) = [
     81 	(":alnum:]", &ascii::isalnum),
     82 	(":alpha:]", &ascii::isalpha),
     83 	(":blank:]", &ascii::isblank),
     84 	(":cntrl:]", &ascii::iscntrl),
     85 	(":digit:]", &ascii::isdigit),
     86 	(":graph:]", &ascii::isgraph),
     87 	(":lower:]", &ascii::islower),
     88 	(":print:]", &ascii::isprint),
     89 	(":punct:]", &ascii::ispunct),
     90 	(":space:]", &ascii::isspace),
     91 	(":upper:]", &ascii::isupper),
     92 	(":xdigit:]", &ascii::isxdigit),
     93 ];
     94 
     95 export type regex = struct {
     96 	insts: []inst,
     97 	charsets: []charset,
     98 	n_reps: size,
     99 };
    100 
    101 // Frees resources associated with a [[regex]].
    102 export fn finish(re: *regex) void = {
    103 	free(re.insts);
    104 	for (let charset .. re.charsets) {
    105 		free(charset);
    106 	};
    107 	free(re.charsets);
    108 };
    109 
    110 fn find_last_groupstart(insts: *[]inst) (size | error) = {
    111 	for (let i = len(insts); i > 0; i -= 1) {
    112 		if (insts[i - 1] is inst_groupstart) {
    113 			return i - 1;
    114 		};
    115 	};
    116 	return `Unmatched ')'`: error;
    117 };
    118 
    119 fn handle_bracket(
    120 	insts: *[]inst,
    121 	r: rune,
    122 	r_idx: *size,
    123 	bracket_idx: *int,
    124 	iter: *strings::iterator,
    125 	charsets: *[]charset,
    126 	skip_charclass_rest: *bool,
    127 	is_charset_positive: *bool,
    128 	in_bracket: *bool
    129 ) (void | error) = {
    130 	const peek1 = strings::next(iter);
    131 	const peek2 = strings::next(iter);
    132 	const peek3 = strings::next(iter);
    133 	if (!(peek1 is done)) {
    134 		strings::prev(iter);
    135 	};
    136 	if (!(peek2 is done)) {
    137 		strings::prev(iter);
    138 	};
    139 	if (!(peek3 is done)) {
    140 		strings::prev(iter);
    141 	};
    142 
    143 	if (*bracket_idx == -1) {
    144 		append(charsets, []);
    145 	};
    146 	*bracket_idx += 1;
    147 
    148 	if (*skip_charclass_rest) {
    149 		if (r == ']') {
    150 			*skip_charclass_rest = false;
    151 		};
    152 		*r_idx += 1;
    153 		return;
    154 	};
    155 
    156 	const is_range = peek1 is rune && peek1 as rune == '-'
    157 		&& !(peek2 is done) && !(peek3 is done)
    158 		&& !(peek2 as rune == ']');
    159 	const range_end = peek2;
    160 	const is_first_char = *bracket_idx == 0 || *bracket_idx == 1
    161 		&& !*is_charset_positive;
    162 
    163 	if (r == '\\') {
    164 		if (peek1 is done) {
    165 			return `Trailing backslash '\'`: error;
    166 		} else {
    167 			append(charsets[len(charsets) - 1],
    168 				peek1: charset_lit_item);
    169 			strings::next(iter);
    170 			*r_idx += 1;
    171 		};
    172 	} else if (r == ']' && !is_first_char) {
    173 		const newinst = inst_charset {
    174 			idx = len(charsets) - 1,
    175 			is_positive = *is_charset_positive,
    176 		};
    177 		append(insts, newinst);
    178 		*in_bracket = false;
    179 		*bracket_idx = -1;
    180 		*is_charset_positive = true;
    181 	} else if (r == '^' && *bracket_idx == 0) {
    182 		*is_charset_positive = false;
    183 	} else if (r == '[' && !(peek1 is done)
    184 			&& peek1 as rune == ':') {
    185 		const rest = strings::iterstr(iter);
    186 		const n_cc = len(charclass_map);
    187 		for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) {
    188 			if (strings::hasprefix(rest, charclass_map[cc_idx].0)) {
    189 				append(charsets[len(charsets) - 1],
    190 					charclass_map[cc_idx].1);
    191 				*skip_charclass_rest = true;
    192 				break;
    193 			};
    194 		};
    195 		if (!*skip_charclass_rest) {
    196 			return `No character class after '[:'`: error;
    197 		};
    198 	} else if (is_range) {
    199 		const start_b = r: u32;
    200 		const end_b = range_end as rune: u32;
    201 
    202 		if (end_b < start_b) {
    203 			return `Descending bracket expression range '[z-a]'`: error;
    204 		};
    205 
    206 		append(charsets[len(charsets) - 1],
    207 			(start_b, end_b): charset_range_item);
    208 		strings::next(iter);
    209 		strings::next(iter);
    210 		*r_idx += 2;
    211 	} else {
    212 		append(charsets[len(charsets) - 1],
    213 			r: charset_lit_item);
    214 	};
    215 
    216 	*r_idx += 1;
    217 };
    218 
    219 // Compiles a regular expression string into a [[regex]].
    220 export fn compile(expr: str) (regex | error) = {
    221 	let insts: []inst = [];
    222 	let charsets: []charset = [];
    223 	let iter = strings::iter(expr);
    224 	let r_idx = 0z;
    225 	let anchored = false;
    226 	let jump_idxs: []size = [];
    227 	let in_bracket = false;
    228 	let skip_charclass_rest = false;
    229 	let bracket_idx = -1;
    230 	let is_charset_positive = true;
    231 	let n_reps = 0z;
    232 	let n_groupstarts = 0;
    233 
    234 	for (true) {
    235 		const next = strings::next(&iter);
    236 
    237 		if (r_idx == 0 && next is rune && next: rune != '^') {
    238 			append(insts, inst_skip);
    239 		};
    240 
    241 		if (in_bracket) {
    242 			if (next is done) {
    243 				return `Unmatched '['`: error;
    244 			};
    245 			const r = next: rune;
    246 			handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter,
    247 				&charsets, &skip_charclass_rest,
    248 				&is_charset_positive,
    249 				&in_bracket)?;
    250 			continue;
    251 		};
    252 
    253 		const r = match (next) {
    254 		case done =>
    255 			if (n_groupstarts > 0) {
    256 				return `Unmatched '('`: error;
    257 			};
    258 			break;
    259 		case let r: rune => yield r;
    260 		};
    261 		switch (r) {
    262 		case '\\' =>
    263 			const peek1 = strings::next(&iter);
    264 			if (peek1 is done) {
    265 				return `Trailing backslash '\'`: error;
    266 			} else {
    267 				append(insts, (peek1 as rune): inst_lit);
    268 				r_idx += 1;
    269 			};
    270 		case '^' =>
    271 			if (r_idx != 0) {
    272 				return `Anchor '^' not at start`: error;
    273 			};
    274 		case '$' =>
    275 			if (r_idx != len(expr) - 1) {
    276 				return `Anchor '$' not at end`: error;
    277 			};
    278 			anchored = true;
    279 		case '[' =>
    280 			in_bracket = true;
    281 		case ']' =>
    282 			append(insts, r: inst_lit);
    283 		case '(' =>
    284 			if (n_groupstarts > 0) {
    285 				return `Nested capture groups are unsupported`: error;
    286 			};
    287 			append(insts, inst_groupstart);
    288 			n_groupstarts += 1;
    289 		case ')' =>
    290 			if (n_groupstarts == 0) {
    291 				return `Unmatched ')'`: error;
    292 			};
    293 			n_groupstarts -= 1;
    294 			append(insts, inst_groupend);
    295 			for (let jump_idx .. jump_idxs) {
    296 				assert(insts[jump_idx] is inst_jump);
    297 				insts[jump_idx] = (len(insts) - 1): inst_jump;
    298 			};
    299 			jump_idxs = [];
    300 		case '|' =>
    301 			append(insts, types::SIZE_MAX: inst_jump);
    302 			const origin = match (find_last_groupstart(&insts)) {
    303 			case error =>
    304 				yield 0z;
    305 			case let sz: size =>
    306 				yield sz + 1;
    307 			};
    308 			const newinst = (len(insts) + 1): inst_split;
    309 			// add split after last jump (if any) or at origin
    310 			const split_idx = if (len(jump_idxs) > 0)
    311 				jump_idxs[len(jump_idxs) - 1] + 1 else origin;
    312 			insert(insts[split_idx], newinst);
    313 			append(jump_idxs, len(insts) - 1);
    314 		case '{' =>
    315 			let origin = len(insts) - 1;
    316 			if (insts[origin] is inst_groupend) {
    317 				origin = find_last_groupstart(&insts)?;
    318 			};
    319 			const rest = strings::iterstr(&iter);
    320 			const rep_parts = parse_repetition(rest)?;
    321 			const can_skip = rep_parts.0 == 0;
    322 			const min = if (rep_parts.0 == 0) {
    323 				yield 1z;
    324 			} else {
    325 				yield rep_parts.0;
    326 			};
    327 			if (can_skip) {
    328 				insert(insts[origin],
    329 					len(insts) + 2: inst_split);
    330 				origin += 1;
    331 			};
    332 			const newinst = inst_repeat {
    333 				id = n_reps,
    334 				origin = origin,
    335 				min = min,
    336 				max = rep_parts.1,
    337 			};
    338 			for (let i = 0z; i <= rep_parts.2; i += 1) {
    339 				strings::next(&iter);
    340 				r_idx += 1;
    341 			};
    342 			append(insts, newinst);
    343 			n_reps += 1;
    344 		case '?' =>
    345 			if (r_idx == 0 || len(insts) == 0) {
    346 				return `Unused '?'`: error;
    347 			};
    348 			let term_start_idx = len(insts) - 1;
    349 			match (insts[term_start_idx]) {
    350 			case (inst_lit | inst_charset | inst_any) => void;
    351 			case inst_groupend =>
    352 				term_start_idx = find_last_groupstart(&insts)?;
    353 			case inst_groupstart =>
    354 				return `Unused '?'`: error;
    355 			case =>
    356 				return `Misused '?'`: error;
    357 			};
    358 			const after_idx = len(insts) + 1;
    359 			insert(insts[term_start_idx], after_idx: inst_split);
    360 		case '*' =>
    361 			if (r_idx == 0 || len(insts) == 0) {
    362 				return `Unused '*'`: error;
    363 			};
    364 			const new_inst_offset = 1z;
    365 			const jump_idx = len(insts) + new_inst_offset;
    366 			const after_idx = jump_idx + 1z;
    367 			let term_start_idx = len(insts) - 1z;
    368 			match (insts[term_start_idx]) {
    369 			case (inst_lit | inst_charset | inst_any) => void;
    370 			case inst_groupend =>
    371 				term_start_idx = find_last_groupstart(&insts)?;
    372 			case inst_groupstart =>
    373 				return `Unused '*'`: error;
    374 			case =>
    375 				return `Misused '*'`: error;
    376 			};
    377 			const split_idx = term_start_idx;
    378 			term_start_idx += new_inst_offset;
    379 			insert(insts[split_idx], after_idx: inst_split);
    380 			append(insts, split_idx: inst_jump);
    381 		case '+' =>
    382 			if (r_idx == 0 || len(insts) == 0) {
    383 				return `Unused '+'`: error;
    384 			};
    385 			let term_start_idx = len(insts) - 1;
    386 			match (insts[term_start_idx]) {
    387 			case (inst_lit | inst_charset | inst_any) => void;
    388 			case inst_groupend =>
    389 				term_start_idx = find_last_groupstart(&insts)?;
    390 			case inst_groupstart =>
    391 				return `Unused '+'`: error;
    392 			case =>
    393 				return `Misused '+'`: error;
    394 			};
    395 			append(insts, term_start_idx: inst_split);
    396 		case '.' =>
    397 			append(insts, inst_any);
    398 		case =>
    399 			append(insts, r: inst_lit);
    400 		};
    401 		r_idx += 1;
    402 	};
    403 
    404 	// handle whole expression alternation
    405 	for (let jump_idx .. jump_idxs) {
    406 		assert(insts[jump_idx] is inst_jump);
    407 		insts[jump_idx] = len(insts): inst_jump;
    408 	};
    409 	jump_idxs = [];
    410 
    411 	append(insts, anchored: inst_match);
    412 
    413 	return regex {
    414 		insts = insts,
    415 		charsets = charsets,
    416 		n_reps = n_reps,
    417 	};
    418 };
    419 
    420 fn parse_repetition(
    421 	s: str
    422 ) (((void | size), (void | size), size) | error) = {
    423 	const first_comma = strings::index(s, ",");
    424 	const first_endbrace = strings::index(s, "}");
    425 	if (first_endbrace is void) {
    426 		return `Repetition expression syntax error '{n}'`: error;
    427 	};
    428 	const first_endbrace = first_endbrace as size;
    429 
    430 	let min_str = "";
    431 	let max_str = "";
    432 	let is_single_arg = false;
    433 	if (first_comma is void || first_endbrace < first_comma as size) {
    434 		const cut = strings::cut(s, "}");
    435 		min_str = cut.0;
    436 		max_str = cut.0;
    437 		is_single_arg = true;
    438 	} else {
    439 		const cut = strings::cut(s, ",");
    440 		min_str = cut.0;
    441 		max_str = strings::cut(cut.1, "}").0;
    442 	};
    443 
    444 	let min: (void | size) = void;
    445 	let max: (void | size) = void;
    446 
    447 	if (len(min_str) > 0) {
    448 		min = match (strconv::stoi(min_str)) {
    449 		case let res: int =>
    450 			yield if (res < 0) {
    451 				return `Negative repetition count '{-n}'`: error;
    452 			} else {
    453 				yield res: size;
    454 			};
    455 		case => return `Repetition expression syntax error '{n}'`: error;
    456 		};
    457 	} else {
    458 		min = 0;
    459 	};
    460 
    461 	if (len(max_str) > 0) {
    462 		max = match (strconv::stoi(max_str)) {
    463 		case let res: int =>
    464 			yield if (res < 0) {
    465 				return `Negative repetition count '{-n}'`: error;
    466 			} else {
    467 				yield res: size;
    468 			};
    469 		case => return `Repetition expression syntax error '{n}'`: error;
    470 		};
    471 	};
    472 
    473 	const rep_len = if (is_single_arg) {
    474 		yield len(min_str);
    475 	} else {
    476 		yield len(min_str) + 1 + len(max_str);
    477 	};
    478 	return (min, max, rep_len);
    479 };
    480 
    481 fn delete_thread(i: size, threads: *[]thread) void = {
    482 	free(threads[i].captures);
    483 	free(threads[i].rep_counters);
    484 	delete(threads[i]);
    485 };
    486 
    487 fn is_consuming_inst(a: inst) bool = {
    488 	return a is (inst_lit | inst_any | inst_charset);
    489 };
    490 
    491 fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = {
    492 	// Do not add this thread if there is already another thread with
    493 	// the same PC
    494 	for (let thread &.. *threads) {
    495 		if (thread.pc == new_pc	&& !thread.matched
    496 				&& thread.start_idx
    497 				< threads[parent_idx].start_idx) {
    498 			return;
    499 		};
    500 	};
    501 
    502 	append(threads, thread {
    503 		pc = new_pc,
    504 		start_idx = threads[parent_idx].start_idx,
    505 		start_bytesize = threads[parent_idx].start_bytesize,
    506 		curr_capture = threads[parent_idx].curr_capture,
    507 		curr_capture_inited = threads[parent_idx].curr_capture_inited,
    508 		matched = threads[parent_idx].matched,
    509 		failed = threads[parent_idx].failed,
    510 		captures = alloc(threads[parent_idx].captures...),
    511 		rep_counters = alloc(threads[parent_idx].rep_counters...),
    512 		...
    513 	});
    514 };
    515 
    516 fn run_thread(
    517 	i: size,
    518 	re: *regex,
    519 	string: str,
    520 	threads: *[]thread,
    521 	r_or_end: (rune | io::EOF),
    522 	str_idx: size,
    523 	str_bytesize: size
    524 ) (void | newmatch) = {
    525 	const str_bytes = strings::toutf8(string);
    526 	if (threads[i].matched) {
    527 		return;
    528 	};
    529 	for (!is_consuming_inst(re.insts[threads[i].pc])) {
    530 		match (re.insts[threads[i].pc]) {
    531 		case inst_lit => abort();
    532 		case inst_any => abort();
    533 		case inst_split =>
    534 			const new_pc = re.insts[threads[i].pc]: inst_split: size;
    535 			add_thread(threads, i, new_pc);
    536 			threads[i].pc += 1;
    537 		case inst_jump =>
    538 			threads[i].pc = re.insts[threads[i].pc]: inst_jump: size;
    539 		case inst_skip =>
    540 			const new_pc = threads[i].pc + 1;
    541 			threads[i].start_idx = str_idx;
    542 			threads[i].start_bytesize = str_bytesize;
    543 			add_thread(threads, i, new_pc);
    544 			break;
    545 		case let anchored: inst_match =>
    546 			// Do not match if we need an end-anchored match, but we
    547 			// have not exhausted our string
    548 			if (anchored && !(r_or_end is io::EOF)) {
    549 				threads[i].failed = true;
    550 				return;
    551 			};
    552 			const content = strings::fromutf8_unsafe(str_bytes[
    553 				threads[i].start_bytesize..str_bytesize]);
    554 			threads[i].root_capture = capture {
    555 				start = threads[i].start_idx,
    556 				start_bytesize = threads[i].start_bytesize,
    557 				end = str_idx,
    558 				end_bytesize = str_bytesize,
    559 				content = content,
    560 			};
    561 			threads[i].matched = true;
    562 			return newmatch;
    563 		case inst_groupstart =>
    564 			assert(!threads[i].curr_capture_inited, "Found nested capture groups in expression, which are not supported");
    565 			threads[i].curr_capture.start = str_idx;
    566 			threads[i].curr_capture.start_bytesize = str_bytesize;
    567 			threads[i].curr_capture_inited = true;
    568 			threads[i].pc += 1;
    569 		case inst_groupend =>
    570 			assert(threads[i].curr_capture_inited, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`);
    571 			threads[i].curr_capture.end = str_idx;
    572 			threads[i].curr_capture.end_bytesize = str_bytesize;
    573 			threads[i].curr_capture.content =
    574 				strings::fromutf8_unsafe(str_bytes[
    575 					threads[i].curr_capture.start_bytesize..
    576 					threads[i].curr_capture.end_bytesize]);
    577 			append(threads[i].captures, threads[i].curr_capture);
    578 			threads[i].curr_capture = capture { ... };
    579 			threads[i].curr_capture_inited = false;
    580 			threads[i].pc += 1;
    581 		case let ir: inst_repeat =>
    582 			assert(ir.id < len(threads[i].rep_counters));
    583 			threads[i].rep_counters[ir.id] += 1;
    584 			if (ir.max is size
    585 					&& threads[i].rep_counters[ir.id]
    586 					> ir.max as size) {
    587 				threads[i].failed = true;
    588 				return;
    589 			};
    590 			const new_pc = threads[i].pc + 1;
    591 			threads[i].pc = ir.origin;
    592 			if (ir.min is void
    593 					|| threads[i].rep_counters[ir.id]
    594 					>= ir.min as size) {
    595 				add_thread(threads, i, new_pc);
    596 			};
    597 		};
    598 	};
    599 
    600 	// From now on, we're only matching consuming instructions, and these
    601 	// can't do anything without another rune.
    602 	if (r_or_end is io::EOF) {
    603 		threads[i].failed = true;
    604 		return;
    605 	};
    606 
    607 	const r = r_or_end as rune;
    608 
    609 	match (re.insts[threads[i].pc]) {
    610 	case inst_skip => return;
    611 	case let lit: inst_lit =>
    612 		if (r != lit) {
    613 			threads[i].failed = true;
    614 		};
    615 	case inst_any => void;
    616 	case let cs: inst_charset =>
    617 		const charset = re.charsets[cs.idx];
    618 		// Disprove the match if we're looking for a negative match
    619 		// Prove the match if we're looking for a positive match
    620 		let matched = !cs.is_positive;
    621 		for (let i = 0z; i < len(charset); i += 1) match (charset[i]) {
    622 		case let lit: charset_lit_item =>
    623 			if (r == lit) {
    624 				// Succeeded if positive match
    625 				// Failed if negative match
    626 				matched = cs.is_positive;
    627 				break;
    628 			};
    629 		case let range: charset_range_item =>
    630 			const r_b = r: u32;
    631 
    632 			if (r_b >= range.0 && r_b <= range.1) {
    633 				// Succeeded if positive match
    634 				// Failed if negative match
    635 				matched = cs.is_positive;
    636 				break;
    637 			};
    638 		case let classfn: charset_class_item =>
    639 			if (classfn(r)) {
    640 				// Succeeded if positive match
    641 				// Failed if negative match
    642 				matched = cs.is_positive;
    643 				break;
    644 			};
    645 		};
    646 		if (!matched) {
    647 			threads[i].failed = true;
    648 		};
    649 	};
    650 
    651 	threads[i].pc += 1;
    652 };
    653 
    654 // Attempts to match a regular expression against a string and returns the
    655 // either the longest leftmost match or all matches.
    656 fn search(
    657 	re: *regex,
    658 	string: str,
    659 	handle: io::handle,
    660 	need_captures: bool
    661 ) (void | []capture) = {
    662 	let threads: []thread = alloc([
    663 		thread { captures = [], ... }
    664 	]);
    665 	if (re.n_reps > 0) {
    666 		threads[0].rep_counters = alloc([0...], re.n_reps);
    667 	};
    668 	defer {
    669 		for (let i = 0z; i < len(threads); i += 1) {
    670 			free(threads[i].captures);
    671 			free(threads[i].rep_counters);
    672 		};
    673 		free(threads);
    674 	};
    675 
    676 	let str_idx = 0z;
    677 	let first_match_idx: (void | size) = void;
    678 	let str_bytesize = 0z;
    679 	let last_bytesize = 0z;
    680 
    681 	for (true) {
    682 		str_bytesize += last_bytesize;
    683 
    684 		if (len(threads) == 0) {
    685 			return void;
    686 		};
    687 
    688 		let all_matched = true;
    689 		for (let i = 0z; i < len(threads); i += 1) {
    690 			if (!threads[i].matched) {
    691 				all_matched = false;
    692 				break;
    693 			};
    694 		};
    695 
    696 		if (all_matched) {
    697 			let best_len = 0z;
    698 			let best_n_captures = 0z;
    699 			let best_idx = 0z;
    700 			for (let i = 0z; i < len(threads); i += 1) {
    701 				let match_len = threads[i].root_capture.end
    702 					- threads[i].root_capture.start;
    703 				const is_better = match_len > best_len
    704 					|| match_len == best_len
    705 					&& len(threads[i].captures)
    706 					> best_n_captures;
    707 				if (is_better) {
    708 					best_len = match_len;
    709 					best_idx = i;
    710 					best_n_captures = len(threads[i].captures);
    711 				};
    712 			};
    713 			let res: []capture = alloc([],
    714 				len(threads[best_idx].captures) + 1);
    715 			append(res, threads[best_idx].root_capture);
    716 			append(res, threads[best_idx].captures...);
    717 			return res;
    718 		};
    719 
    720 		const r_or_end = bufio::read_rune(handle)!;
    721 		if (r_or_end is rune) {
    722 			last_bytesize = utf8::runesz(r_or_end as rune);
    723 		};
    724 
    725 		for (let i = 0z; i < len(threads); i += 1) {
    726 			const res = run_thread(i, re, string, &threads,
    727 				r_or_end, str_idx, str_bytesize);
    728 			const matchlen = threads[i].root_capture.end
    729 				- threads[i].root_capture.start;
    730 			if (res is newmatch && matchlen > 0 && !need_captures) {
    731 				return [];
    732 			};
    733 			const is_better = res is newmatch && matchlen > 0
    734 				&& (first_match_idx is void
    735 					|| threads[i].start_idx
    736 						< first_match_idx as size);
    737 			if (is_better) {
    738 				first_match_idx = threads[i].start_idx;
    739 			};
    740 		};
    741 		str_idx += 1;
    742 
    743 		// When we only want the leftmost match, delete all threads that
    744 		// start after the earliest non-zero-length matched thread
    745 		if (first_match_idx is size) {
    746 			for (let thread &.. threads) {
    747 				if (thread.start_idx > first_match_idx as size) {
    748 					thread.failed = true;
    749 				};
    750 			};
    751 		};
    752 
    753 		// Delete threads that have a PC that has already been
    754 		// encountered in previous threads. Prioritise threads that
    755 		// have an earlier start_idx, and threads that were added
    756 		// earlier.
    757 		for (let i = 0i64; i < len(threads): i64 - 1; i += 1) {
    758 			for (let j = i + 1; j < len(threads): i64; j += 1) {
    759 				const same_pc = threads[i].pc == threads[j].pc;
    760 				const none_matched = !threads[j].matched
    761 					&& !threads[i].matched;
    762 				if (same_pc && none_matched) {
    763 					if (threads[i].start_idx
    764 							<= threads[j].start_idx) {
    765 						delete_thread(j: size, &threads);
    766 						j -= 1;
    767 					} else {
    768 						delete_thread(i: size, &threads);
    769 						i -= 1;
    770 						break;
    771 					};
    772 				};
    773 			};
    774 		};
    775 
    776 		for (let i = 0z; i < len(threads); i += 1) {
    777 			if (threads[i].failed) {
    778 				delete_thread(i, &threads);
    779 				i -= 1;
    780 			};
    781 		};
    782 	};
    783 };
    784 
    785 // Returns whether or not a [[regex]] matches any part of a given string.
    786 export fn test(re: *regex, string: str) bool = {
    787 	let strm = memio::fixed(strings::toutf8(string));
    788 	return search(re, string, &strm, false) is []capture;
    789 };
    790 
    791 
    792 // Attempts to match a [[regex]] against a string and returns the longest
    793 // leftmost match as a [[result]]. The caller must free the return value with
    794 // [[result_free]].
    795 export fn find(re: *regex, string: str) result = {
    796 	let strm = memio::fixed(strings::toutf8(string));
    797 	match (search(re, string, &strm, true)) {
    798 	case let m: []capture =>
    799 		return m;
    800 	case void =>
    801 		return [];
    802 	};
    803 };
    804 
    805 // Attempts to match a [[regex]] against a string and returns all
    806 // non-overlapping matches as a slice of [[result]]s. The caller must free the
    807 // return value with [[result_freeall]].
    808 export fn findall(re: *regex, string: str) []result = {
    809 	let res: []result = [];
    810 	let str_idx = 0z, str_bytesize = 0z;
    811 	let strm = memio::fixed(strings::toutf8(string));
    812 	const str_bytes = strings::toutf8(string);
    813 	for (true) {
    814 		let substring = strings::fromutf8_unsafe(
    815 			str_bytes[str_bytesize..]);
    816 		match (search(re, substring, &strm, true)) {
    817 		case let m: []capture =>
    818 			append(res, m);
    819 			m[0].start += str_idx;
    820 			m[0].end += str_idx;
    821 			m[0].start_bytesize += str_bytesize;
    822 			m[0].end_bytesize += str_bytesize;
    823 			str_idx = m[0].end;
    824 			str_bytesize = m[0].end_bytesize;
    825 			if (m[0].start_bytesize == len(str_bytes)) {
    826 				// end-of-string reached
    827 				break;
    828 			};
    829 			if (m[0].start_bytesize == m[0].end_bytesize) {
    830 				// zero-length match
    831 				// forward rune and byte indices
    832 				str_idx += 1;
    833 				str_bytesize += encoding::utf8::utf8sz(
    834 					str_bytes[str_bytesize])!;
    835 			};
    836 			io::seek(&strm, str_bytesize: io::off,
    837 				io::whence::SET)!;
    838 		case void => break;
    839 		};
    840 	};
    841 	return res;
    842 };
    843 
    844 // Replaces all non-overlapping matches of a regular expression against a string
    845 // with 'targetstr'.
    846 //
    847 // A backslash followed by a single decimal number within 'targetstr' is
    848 // replaced by the capture at that index (starting at 1), or an empty string if
    849 // no such capture exists. For example, `\1` is replaced with the first capture,
    850 // `\2` with the second, etc. `\0` is substituted with the entire substring that
    851 // was matched. `\\` is replaced with a literal backslash. The caller must free
    852 // the return value.
    853 //
    854 // An error is only returned if 'targetstr' isn't formatted correctly.
    855 export fn replace(re: *regex, string: str, targetstr: str) (str | error) = {
    856 	return replacen(re, string, targetstr, types::SIZE_MAX);
    857 };
    858 
    859 // Replaces up to 'n' non-overlapping matches of a regular expression against a
    860 // string with 'targetstr', in the same manner as [[replace]]. The caller must
    861 // free the return value.
    862 export fn replacen(
    863 	re: *regex,
    864 	string: str,
    865 	targetstr: str,
    866 	n: size,
    867 ) (str | error) = {
    868 	const target = parse_replace_target(targetstr)?;
    869 	defer free(target);
    870 	// Check if n == 0 after parse_replace_target so errors are propagated
    871 	if (n == 0) {
    872 		return strings::dup(string);
    873 	};
    874 
    875 	const matches = findall(re, string);
    876 	if (len(matches) == 0) {
    877 		return strings::dup(string);
    878 	};
    879 	defer result_freeall(matches);
    880 
    881 	const bytes = strings::toutf8(string);
    882 	let buf = alloc(bytes[..matches[0][0].start_bytesize]...);
    883 
    884 	const n = if (len(matches) > n) n else len(matches);
    885 	for (let i = 0z; i < n; i += 1) {
    886 		for (let j = 0z; j < len(target); j += 1) {
    887 			match (target[j]) {
    888 			case let b: []u8 =>
    889 				append(buf, b...);
    890 			case let z: size =>
    891 				if (z >= len(matches[i])) yield;
    892 				const b = strings::toutf8(matches[i][z].content);
    893 				append(buf, b...);
    894 			};
    895 		};
    896 		const start = matches[i][0].end_bytesize;
    897 		const end = if (i == n - 1) len(bytes)
    898 			else matches[i + 1][0].start_bytesize;
    899 		append(buf, bytes[start..end]...);
    900 	};
    901 
    902 	return strings::fromutf8(buf)!;
    903 };
    904 
    905 fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = {
    906 	const bytes = strings::toutf8(targetstr);
    907 	let target: []([]u8 | size) = alloc([], 1);
    908 	let iter = strings::iter(targetstr);
    909 	let start = 0z, end = 0z;
    910 	for (true) match (strings::next(&iter)) {
    911 	case done =>
    912 		if (start != end) {
    913 			append(target, bytes[start..]);
    914 		};
    915 		break;
    916 	case let r: rune =>
    917 		if (r == '\\') {
    918 			if (start != end) {
    919 				append(target, bytes[start..end]);
    920 			};
    921 
    922 			const r = match (strings::next(&iter)) {
    923 			case done =>
    924 				return "Trailing backslash": error;
    925 			case let r: rune =>
    926 				yield r;
    927 			};
    928 
    929 			if (r == '\\') {
    930 				append(target, '\\');
    931 			} else if (ascii::isdigit(r)) {
    932 				append(target, r: u32: size - 0x30);
    933 			} else {
    934 				return "Backslash must be followed by positive decimal number or a backslash": error;
    935 			};
    936 
    937 			end += 2;
    938 			start = end;
    939 		} else {
    940 			end += utf8::runesz(r);
    941 		};
    942 	};
    943 
    944 	return target;
    945 };
    946 
    947 // Replaces all non-overlapping matches of a regular expression against a string
    948 // with 'targetstr'. 'targetstr' is isn't interpreted in any special way; all
    949 // backslashes are treated literally. The caller must free the return value.
    950 export fn rawreplace(re: *regex, string: str, targetstr: str) str = {
    951 	return rawreplacen(re, string, targetstr, types::SIZE_MAX);
    952 };
    953 
    954 // Replaces up to 'n' non-overlapping matches of a regular expression against a
    955 // string with 'targetstr', in the same manner as [[rawreplace]]. The caller
    956 // must free the return value.
    957 export fn rawreplacen(re: *regex, string: str, targetstr: str, n: size) str = {
    958 	if (n == 0) {
    959 		return strings::dup(string);
    960 	};
    961 
    962 	const matches = findall(re, string);
    963 	if (len(matches) == 0) {
    964 		return strings::dup(string);
    965 	};
    966 	defer result_freeall(matches);
    967 
    968 	const target = strings::toutf8(targetstr);
    969 	const bytes = strings::toutf8(string);
    970 	let buf: []u8 = [];
    971 
    972 	append(buf, bytes[..matches[0][0].start_bytesize]...);
    973 	const n = if (len(matches) > n) n else len(matches);
    974 	for (let i = 1z; i < n; i += 1) {
    975 		append(buf, target...);
    976 		const start = matches[i - 1][0].end_bytesize;
    977 		const end = matches[i][0].start_bytesize;
    978 		append(buf, bytes[start..end]...);
    979 	};
    980 	append(buf, target...);
    981 	append(buf, bytes[matches[n - 1][0].end_bytesize..]...);
    982 
    983 	return strings::fromutf8(buf)!;
    984 };
    985 
    986 // Frees a [[result]].
    987 export fn result_free(s: result) void = {
    988 	free(s);
    989 };
    990 
    991 // Frees a slice of [[result]]s.
    992 export fn result_freeall(s: []result) void = {
    993 	for (let res .. s) {
    994 		result_free(res);
    995 	};
    996 	free(s);
    997 };
    998 
    999 // Converts an [[error]] into a user-friendly string.
   1000 export fn strerror(err: error) str = err;