regex.ha - hare - [hare] The Hare programming language

regex.ha (29397B)
      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use ascii;
      5 use bufio;
      6 use encoding::utf8;
      7 use io;
      8 use memio;
      9 use strconv;
     10 use strings;
     11 use types;
     12 
     13 // An error string describing a compilation error.
     14 export type error = !str;
     15 
     16 export type inst_lit = rune,
     17 	inst_charset = struct { idx: size, is_positive: bool },
     18 	inst_any = void,
     19 	inst_split = size,
     20 	inst_jump = size,
     21 	inst_skip = void,
     22 	inst_match = bool,
     23 	inst_groupstart = size,
     24 	inst_groupend = void,
     25 	inst_repeat = struct {
     26 		id: size,
     27 		origin: size,
     28 		min: (void | size),
     29 		max: (void | size),
     30 	};
     31 
     32 export type inst = (inst_lit | inst_any | inst_split | inst_jump |
     33 	inst_skip | inst_match | inst_charset |
     34 	inst_groupstart | inst_groupend |
     35 	inst_repeat);
     36 
     37 // The resulting match of a [[regex]] applied to a string.
     38 //
     39 // The first [[capture]] corresponds to the implicit zeroth capture group,
     40 // i.e. the whole expression.
     41 //
     42 // The rest of the [[capture]]s correspond to the rest of the capture groups,
     43 // i.e. the sub-expressions.
     44 export type result = []capture;
     45 
     46 // A (sub)match corresponding to a regular expression's capture group.
     47 export type capture = struct {
     48 	content: str,
     49 	start: size,
     50 	start_bytesize: size,
     51 	end: size,
     52 	end_bytesize: size
     53 };
     54 
     55 type thread = struct {
     56 	pc: size,
     57 	start_idx: size,
     58 	start_bytesize: size,
     59 	root_capture: capture,
     60 	captures: []capture,
     61 	rep_counters: []size,
     62 	matched: bool,
     63 	failed: bool,
     64 };
     65 
     66 type newmatch = void;
     67 
     68 export type charset = [](charset_lit_item | charset_range_item |
     69 	charset_class_item),
     70 	charset_lit_item = rune,
     71 	charset_range_item = (u32, u32),
     72 	charset_class_item = (str, *fn(c: rune) bool);
     73 
     74 const charclass_map: [](str, *fn(c: rune) bool) = [
     75 	(":alnum:]", &ascii::isalnum),
     76 	(":alpha:]", &ascii::isalpha),
     77 	(":blank:]", &ascii::isblank),
     78 	(":cntrl:]", &ascii::iscntrl),
     79 	(":digit:]", &ascii::isdigit),
     80 	(":graph:]", &ascii::isgraph),
     81 	(":lower:]", &ascii::islower),
     82 	(":print:]", &ascii::isprint),
     83 	(":punct:]", &ascii::ispunct),
     84 	(":space:]", &ascii::isspace),
     85 	(":upper:]", &ascii::isupper),
     86 	(":xdigit:]", &ascii::isxdigit),
     87 ];
     88 
     89 export type regex = struct {
     90 	insts: []inst,
     91 	charsets: []charset,
     92 	n_reps: size,
     93 };
     94 
     95 // Frees resources associated with a [[regex]].
     96 export fn finish(re: *regex) void = {
     97 	free(re.insts);
     98 	for (let charset .. re.charsets) {
     99 		free(charset);
    100 	};
    101 	free(re.charsets);
    102 };
    103 
    104 fn find_last_groupstart(insts: []inst) (size | error) = {
    105 	let nested = 0u;
    106 	for (let i = len(insts); i > 0; i -= 1) {
    107 		match (insts[i - 1]) {
    108 		case inst_groupstart =>
    109 			if (nested == 0) {
    110 				return i - 1;
    111 			};
    112 			nested -= 1;
    113 		case inst_groupend =>
    114 			nested += 1;
    115 		case => void;
    116 		};
    117 	};
    118 	return `Unmatched ')'`: error;
    119 };
    120 
    121 // increments all [[inst_jump]] and [[inst_split]] instructions to account for a
    122 // newly inserted instruction before the given slice
    123 fn shift(sl: []inst) void = {
    124 	for (let inst &.. sl) {
    125 		match (*inst) {
    126 		case let z: inst_jump =>
    127 			*inst = (z + 1): inst_jump;
    128 		case let z: inst_split =>
    129 			*inst = (z + 1): inst_split;
    130 		case => void;
    131 		};
    132 	};
    133 };
    134 
    135 fn handle_bracket(
    136 	insts: *[]inst,
    137 	r: rune,
    138 	r_idx: *size,
    139 	bracket_idx: *int,
    140 	iter: *strings::iterator,
    141 	charsets: *[]charset,
    142 	skip_charclass_rest: *bool,
    143 	is_charset_positive: *bool,
    144 	in_bracket: *bool
    145 ) (void | error) = {
    146 	const peek1 = strings::next(iter);
    147 	const peek2 = strings::next(iter);
    148 	const peek3 = strings::next(iter);
    149 	if (!(peek1 is done)) {
    150 		strings::prev(iter);
    151 	};
    152 	if (!(peek2 is done)) {
    153 		strings::prev(iter);
    154 	};
    155 	if (!(peek3 is done)) {
    156 		strings::prev(iter);
    157 	};
    158 
    159 	if (*bracket_idx == -1) {
    160 		append(charsets, [])!;
    161 	};
    162 	*bracket_idx += 1;
    163 
    164 	if (*skip_charclass_rest) {
    165 		if (r == ']') {
    166 			*skip_charclass_rest = false;
    167 		};
    168 		*r_idx += 1;
    169 		return;
    170 	};
    171 
    172 	const is_range = peek1 is rune && peek1 as rune == '-'
    173 		&& !(peek2 is done) && !(peek3 is done)
    174 		&& !(peek2 as rune == ']');
    175 	const range_end = peek2;
    176 	const is_first_char = *bracket_idx == 0 || *bracket_idx == 1
    177 		&& !*is_charset_positive;
    178 
    179 	if (r == '\\') {
    180 		if (peek1 is done) {
    181 			return `Trailing backslash '\'`: error;
    182 		} else {
    183 			append(charsets[len(charsets) - 1],
    184 				peek1: charset_lit_item)!;
    185 			strings::next(iter);
    186 			*r_idx += 1;
    187 		};
    188 	} else if (r == ']' && !is_first_char) {
    189 		const newinst = inst_charset {
    190 			idx = len(charsets) - 1,
    191 			is_positive = *is_charset_positive,
    192 		};
    193 		append(insts, newinst)!;
    194 		*in_bracket = false;
    195 		*bracket_idx = -1;
    196 		*is_charset_positive = true;
    197 	} else if (r == '^' && *bracket_idx == 0) {
    198 		*is_charset_positive = false;
    199 	} else if (r == '[' && !(peek1 is done)
    200 			&& peek1 as rune == ':') {
    201 		const rest = strings::iterstr(iter);
    202 		const n_cc = len(charclass_map);
    203 		for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) {
    204 			if (strings::hasprefix(rest, charclass_map[cc_idx].0)) {
    205 				append(charsets[len(charsets) - 1],
    206 					charclass_map[cc_idx])!;
    207 				*skip_charclass_rest = true;
    208 				break;
    209 			};
    210 		};
    211 		if (!*skip_charclass_rest) {
    212 			return `No character class after '[:'`: error;
    213 		};
    214 	} else if (is_range) {
    215 		const start_b = r: u32;
    216 		const end_b = range_end as rune: u32;
    217 
    218 		if (end_b < start_b) {
    219 			return `Descending bracket expression range '[z-a]'`: error;
    220 		};
    221 
    222 		append(charsets[len(charsets) - 1],
    223 			(start_b, end_b): charset_range_item)!;
    224 		strings::next(iter);
    225 		strings::next(iter);
    226 		*r_idx += 2;
    227 	} else {
    228 		append(charsets[len(charsets) - 1],
    229 			r: charset_lit_item)!;
    230 	};
    231 
    232 	*r_idx += 1;
    233 };
    234 
    235 // Compiles a regular expression string into a [[regex]].
    236 export fn compile(expr: str) (regex | error) = {
    237 	let insts: []inst = [];
    238 	let charsets: []charset = [];
    239 	let iter = strings::iter(expr);
    240 	let r_idx = 0z;
    241 	let jump_idxs: [][]size = [];
    242 	append(jump_idxs, [])!;
    243 	defer free(jump_idxs);
    244 	let in_bracket = false;
    245 	let skip_charclass_rest = false;
    246 	let bracket_idx = -1;
    247 	let is_charset_positive = true;
    248 	let was_prev_rune_pipe = false;
    249 	let n_reps = 0z;
    250 	let group_level = 0z;
    251 	let capture_idx = 0z;
    252 
    253 	for (true) {
    254 		const next = strings::next(&iter);
    255 
    256 		if (r_idx == 0 && next is rune && next: rune != '^') {
    257 			append(insts, inst_skip)!;
    258 		};
    259 
    260 		if (in_bracket) {
    261 			if (next is done) {
    262 				return `Unmatched '['`: error;
    263 			};
    264 			const r = next: rune;
    265 			handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter,
    266 				&charsets, &skip_charclass_rest,
    267 				&is_charset_positive,
    268 				&in_bracket)?;
    269 			continue;
    270 		};
    271 
    272 		const r = match (next) {
    273 		case done =>
    274 			if (group_level > 0) {
    275 				return `Unmatched '('`: error;
    276 			};
    277 			break;
    278 		case let r: rune => yield r;
    279 		};
    280 		switch (r) {
    281 		case '\\' =>
    282 			const peek1 = strings::next(&iter);
    283 			if (peek1 is done) {
    284 				return `Trailing backslash '\'`: error;
    285 			} else {
    286 				append(insts, (peek1 as rune): inst_lit)!;
    287 				r_idx += 1;
    288 			};
    289 		case '^' =>
    290 			if (group_level > 0) {
    291 				return `Anchor '^' in capture groups is unsupported`: error;
    292 			};
    293 			if (!(r_idx == 0 || was_prev_rune_pipe)) {
    294 				return `Anchor '^' not at start of whole pattern or alternation`: error;
    295 			};
    296 		case '$' =>
    297 			if (group_level > 0) {
    298 				return `Anchor '$' in capture groups is unsupported`: error;
    299 			};
    300 			const peek1 = strings::next(&iter);
    301 			if (peek1 is rune) {
    302 				if (peek1 as rune != '|') {
    303 					return `Anchor '$' not at end of whole pattern or alternation`: error;
    304 				};
    305 				strings::prev(&iter);
    306 			};
    307 			append(insts, true: inst_match)!;
    308 		case '[' =>
    309 			in_bracket = true;
    310 		case ']' =>
    311 			append(insts, r: inst_lit)!;
    312 		case '(' =>
    313 			append(insts, capture_idx: inst_groupstart)!;
    314 			group_level += 1;
    315 			capture_idx += 1;
    316 			for (len(jump_idxs) < group_level + 1) {
    317 				append(jump_idxs, [])!;
    318 			};
    319 		case ')' =>
    320 			if (group_level == 0) {
    321 				return `Unmatched ')'`: error;
    322 			};
    323 			append(insts, inst_groupend)!;
    324 			for (let jump_idx .. jump_idxs[group_level]) {
    325 				assert(insts[jump_idx] is inst_jump);
    326 				insts[jump_idx] = (len(insts) - 1): inst_jump;
    327 			};
    328 			delete(jump_idxs[group_level][..]);
    329 			group_level -= 1;
    330 		case '|' =>
    331 			append(insts, types::SIZE_MAX: inst_jump)!;
    332 			const origin = match (find_last_groupstart(insts)) {
    333 			case error =>
    334 				yield 0z;
    335 			case let sz: size =>
    336 				yield sz + 1;
    337 			};
    338 			const newinst = (len(insts) + 1): inst_split;
    339 			// add split after last jump (if any) or at origin
    340 			const split_idx = if (len(jump_idxs[group_level]) > 0)
    341 				jump_idxs[group_level][len(jump_idxs[group_level]) - 1] + 1 else origin;
    342 			insert(insts[split_idx], newinst)!;
    343 			shift(insts[split_idx + 1..]);
    344 			// our insertion of our split_idx should never interfere
    345 			// with an existing jump_idx
    346 			for (let jump_idx .. jump_idxs[group_level]) {
    347 				// if this assertion ends up being hit in the
    348 				// future, it is a sign that jump_idx should be
    349 				// incremented
    350 				assert(jump_idx < split_idx, `Found jump_idx interference. Please report this as a bug`);
    351 			};
    352 			append(jump_idxs[group_level], len(insts) - 1)!;
    353 			// add skip if it's a whole-expression alternation
    354 			if (origin == 0) {
    355 				const peek1 = strings::next(&iter);
    356 				if (peek1 is rune) {
    357 					if (peek1 as rune != '^') {
    358 						append(insts, inst_skip)!;
    359 					};
    360 					strings::prev(&iter);
    361 				};
    362 			};
    363 		case '{' =>
    364 			let origin = len(insts) - 1;
    365 			if (insts[origin] is inst_groupend) {
    366 				origin = find_last_groupstart(insts[..origin])?;
    367 			};
    368 			const rest = strings::iterstr(&iter);
    369 			const rep_parts = parse_repetition(rest)?;
    370 			const can_skip = rep_parts.0 == 0;
    371 			const min = if (rep_parts.0 == 0) {
    372 				yield 1z;
    373 			} else {
    374 				yield rep_parts.0;
    375 			};
    376 			if (can_skip) {
    377 				// len(insts) - 1 is the current last instruction
    378 				// len(insts) is the next instruction
    379 				// advance to len(insts) + 1 to make space for the `inst_split`
    380 				// advance to len(insts) + 2 to make space for the `inst_repeat`
    381 				insert(insts[origin],
    382 					len(insts) + 2: inst_split)!;
    383 				shift(insts[origin + 1..]);
    384 				origin += 1;
    385 			};
    386 			const newinst = inst_repeat {
    387 				id = n_reps,
    388 				origin = origin,
    389 				min = min,
    390 				max = rep_parts.1,
    391 			};
    392 			for (let i = 0z; i <= rep_parts.2; i += 1) {
    393 				strings::next(&iter);
    394 				r_idx += 1;
    395 			};
    396 			append(insts, newinst)!;
    397 			n_reps += 1;
    398 		case '?' =>
    399 			if (r_idx == 0 || len(insts) == 0) {
    400 				return `Unused '?'`: error;
    401 			};
    402 			let term_start_idx = len(insts) - 1;
    403 			match (insts[term_start_idx]) {
    404 			case (inst_lit | inst_charset | inst_any) => void;
    405 			case inst_groupend =>
    406 				term_start_idx = find_last_groupstart(
    407 					insts[..term_start_idx])?;
    408 			case inst_groupstart =>
    409 				return `Unused '?'`: error;
    410 			case =>
    411 				return `Misused '?'`: error;
    412 			};
    413 			const after_idx = len(insts) + 1;
    414 			insert(insts[term_start_idx], after_idx: inst_split)!;
    415 			shift(insts[term_start_idx + 1..]);
    416 		case '*' =>
    417 			if (r_idx == 0 || len(insts) == 0) {
    418 				return `Unused '*'`: error;
    419 			};
    420 			const new_inst_offset = 1z;
    421 			const jump_idx = len(insts) + new_inst_offset;
    422 			const after_idx = jump_idx + 1z;
    423 			let term_start_idx = len(insts) - 1z;
    424 			match (insts[term_start_idx]) {
    425 			case (inst_lit | inst_charset | inst_any) => void;
    426 			case inst_groupend =>
    427 				term_start_idx = find_last_groupstart(
    428 					insts[..term_start_idx])?;
    429 			case inst_groupstart =>
    430 				return `Unused '*'`: error;
    431 			case =>
    432 				return `Misused '*'`: error;
    433 			};
    434 			const split_idx = term_start_idx;
    435 			term_start_idx += new_inst_offset;
    436 			insert(insts[split_idx], after_idx: inst_split)!;
    437 			shift(insts[split_idx + 1..]);
    438 			append(insts, split_idx: inst_jump)!;
    439 		case '+' =>
    440 			if (r_idx == 0 || len(insts) == 0) {
    441 				return `Unused '+'`: error;
    442 			};
    443 			let term_start_idx = len(insts) - 1;
    444 			match (insts[term_start_idx]) {
    445 			case (inst_lit | inst_charset | inst_any) => void;
    446 			case inst_groupend =>
    447 				term_start_idx = find_last_groupstart(
    448 					insts[..term_start_idx])?;
    449 			case inst_groupstart =>
    450 				return `Unused '+'`: error;
    451 			case =>
    452 				return `Misused '+'`: error;
    453 			};
    454 			append(insts, term_start_idx: inst_split)!;
    455 		case '.' =>
    456 			append(insts, inst_any)!;
    457 		case =>
    458 			append(insts, r: inst_lit)!;
    459 		};
    460 		was_prev_rune_pipe = (r == '|');
    461 		r_idx += 1;
    462 	};
    463 
    464 	// handle whole expression alternation
    465 	for (let jump_idx .. jump_idxs[0]) {
    466 		assert(insts[jump_idx] is inst_jump);
    467 		insts[jump_idx] = len(insts): inst_jump;
    468 	};
    469 
    470 	if (len(insts) == 0 || !(insts[len(insts) - 1] is inst_match)) {
    471 		append(insts, false: inst_match)!;
    472 	};
    473 
    474 	return regex {
    475 		insts = insts,
    476 		charsets = charsets,
    477 		n_reps = n_reps,
    478 	};
    479 };
    480 
    481 fn parse_repetition(
    482 	s: str
    483 ) (((void | size), (void | size), size) | error) = {
    484 	const first_comma = strings::index(s, ",");
    485 	const first_endbrace = strings::index(s, "}");
    486 	if (first_endbrace is void) {
    487 		return `Repetition expression syntax error '{n}'`: error;
    488 	};
    489 	const first_endbrace = first_endbrace as size;
    490 
    491 	let min_str = "";
    492 	let max_str = "";
    493 	let is_single_arg = false;
    494 	if (first_comma is void || first_endbrace < first_comma as size) {
    495 		const cut = strings::cut(s, "}");
    496 		min_str = cut.0;
    497 		max_str = cut.0;
    498 		is_single_arg = true;
    499 	} else {
    500 		const cut = strings::cut(s, ",");
    501 		min_str = cut.0;
    502 		max_str = strings::cut(cut.1, "}").0;
    503 	};
    504 
    505 	let min: (void | size) = void;
    506 	let max: (void | size) = void;
    507 
    508 	if (len(min_str) > 0) {
    509 		min = match (strconv::stoi(min_str)) {
    510 		case let res: int =>
    511 			yield if (res < 0) {
    512 				return `Negative repetition count '{-n}'`: error;
    513 			} else {
    514 				yield res: size;
    515 			};
    516 		case => return `Repetition expression syntax error '{n}'`: error;
    517 		};
    518 	} else {
    519 		min = 0;
    520 	};
    521 
    522 	if (len(max_str) > 0) {
    523 		max = match (strconv::stoi(max_str)) {
    524 		case let res: int =>
    525 			yield if (res < 0) {
    526 				return `Negative repetition count '{-n}'`: error;
    527 			} else {
    528 				yield res: size;
    529 			};
    530 		case => return `Repetition expression syntax error '{n}'`: error;
    531 		};
    532 	};
    533 
    534 	const rep_len = if (is_single_arg) {
    535 		yield len(min_str);
    536 	} else {
    537 		yield len(min_str) + 1 + len(max_str);
    538 	};
    539 	return (min, max, rep_len);
    540 };
    541 
    542 fn delete_thread(i: size, threads: *[]thread) void = {
    543 	free(threads[i].captures);
    544 	free(threads[i].rep_counters);
    545 	delete(threads[i]);
    546 };
    547 
    548 fn is_consuming_inst(a: inst) bool = {
    549 	return a is (inst_lit | inst_any | inst_charset);
    550 };
    551 
    552 fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = {
    553 	// Do not add this thread if there is already another thread with
    554 	// the same PC
    555 	for (let thread &.. *threads) {
    556 		if (thread.pc == new_pc	&& !thread.matched
    557 				&& thread.start_idx
    558 				< threads[parent_idx].start_idx) {
    559 			return;
    560 		};
    561 	};
    562 
    563 	append(threads, thread {
    564 		pc = new_pc,
    565 		start_idx = threads[parent_idx].start_idx,
    566 		start_bytesize = threads[parent_idx].start_bytesize,
    567 		matched = threads[parent_idx].matched,
    568 		failed = threads[parent_idx].failed,
    569 		captures = alloc(threads[parent_idx].captures...)!,
    570 		rep_counters = alloc(threads[parent_idx].rep_counters...)!,
    571 		...
    572 	})!;
    573 };
    574 
    575 fn run_thread(
    576 	i: size,
    577 	re: *regex,
    578 	string: str,
    579 	threads: *[]thread,
    580 	r_or_end: (rune | io::EOF),
    581 	str_idx: size,
    582 	str_bytesize: size
    583 ) (void | newmatch) = {
    584 	const str_bytes = strings::toutf8(string);
    585 	if (threads[i].matched) {
    586 		return;
    587 	};
    588 	for (!is_consuming_inst(re.insts[threads[i].pc])) {
    589 		match (re.insts[threads[i].pc]) {
    590 		case inst_lit => abort();
    591 		case inst_any => abort();
    592 		case inst_split =>
    593 			const new_pc = re.insts[threads[i].pc]: inst_split: size;
    594 			add_thread(threads, i, new_pc);
    595 			threads[i].pc += 1;
    596 		case inst_jump =>
    597 			threads[i].pc = re.insts[threads[i].pc]: inst_jump: size;
    598 		case inst_skip =>
    599 			const new_pc = threads[i].pc + 1;
    600 			threads[i].start_idx = str_idx;
    601 			threads[i].start_bytesize = str_bytesize;
    602 			add_thread(threads, i, new_pc);
    603 			break;
    604 		case let anchored: inst_match =>
    605 			// Do not match if we need an end-anchored match, but we
    606 			// have not exhausted our string
    607 			if (anchored && !(r_or_end is io::EOF)) {
    608 				threads[i].failed = true;
    609 				return;
    610 			};
    611 			const content = strings::fromutf8_unsafe(str_bytes[
    612 				threads[i].start_bytesize..str_bytesize]);
    613 			threads[i].root_capture = capture {
    614 				start = threads[i].start_idx,
    615 				start_bytesize = threads[i].start_bytesize,
    616 				end = str_idx,
    617 				end_bytesize = str_bytesize,
    618 				content = content,
    619 			};
    620 			threads[i].matched = true;
    621 			return newmatch;
    622 		case let idx: inst_groupstart =>
    623 			if (idx >= len(threads[i].captures)) {
    624 				append(threads[i].captures,
    625 					[capture { ... }...],
    626 					idx - len(threads[i].captures) + 1)!;
    627 			};
    628 			assert(threads[i].captures[idx].end != types::SIZE_MAX);
    629 			threads[i].captures[idx] = capture {
    630 				content = "",
    631 				start = str_idx,
    632 				start_bytesize = str_bytesize,
    633 				// end=types::SIZE_MAX indicates that the
    634 				// capture group hasn't ended yet
    635 				end = types::SIZE_MAX,
    636 				end_bytesize = types::SIZE_MAX,
    637 			};
    638 			threads[i].pc += 1;
    639 		case inst_groupend =>
    640 			let curr_capture = len(threads[i].captures);
    641 			for (curr_capture > 0; curr_capture -= 1) {
    642 				// find inner-most unclosed capture group
    643 				if (threads[i].captures[curr_capture - 1].end
    644 						== types::SIZE_MAX) {
    645 					break;
    646 				};
    647 			};
    648 			assert(curr_capture > 0, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`);
    649 			let capture = &threads[i].captures[curr_capture - 1];
    650 			capture.end = str_idx;
    651 			capture.end_bytesize = str_bytesize;
    652 			capture.content = strings::fromutf8_unsafe(str_bytes[
    653 				capture.start_bytesize..capture.end_bytesize]);
    654 			threads[i].pc += 1;
    655 		case let ir: inst_repeat =>
    656 			assert(ir.id < len(threads[i].rep_counters));
    657 			threads[i].rep_counters[ir.id] += 1;
    658 			if (ir.max is size
    659 					&& threads[i].rep_counters[ir.id]
    660 					> ir.max as size) {
    661 				threads[i].failed = true;
    662 				return;
    663 			};
    664 			const new_pc = threads[i].pc + 1;
    665 			threads[i].pc = ir.origin;
    666 			if (ir.min is void
    667 					|| threads[i].rep_counters[ir.id]
    668 					>= ir.min as size) {
    669 				add_thread(threads, i, new_pc);
    670 			};
    671 		};
    672 	};
    673 
    674 	// From now on, we're only matching consuming instructions, and these
    675 	// can't do anything without another rune.
    676 	if (r_or_end is io::EOF) {
    677 		threads[i].failed = true;
    678 		return;
    679 	};
    680 
    681 	const r = r_or_end as rune;
    682 
    683 	match (re.insts[threads[i].pc]) {
    684 	case inst_skip => return;
    685 	case let lit: inst_lit =>
    686 		if (r != lit) {
    687 			threads[i].failed = true;
    688 		};
    689 	case inst_any => void;
    690 	case let cs: inst_charset =>
    691 		const charset = re.charsets[cs.idx];
    692 		// Disprove the match if we're looking for a negative match
    693 		// Prove the match if we're looking for a positive match
    694 		let matched = !cs.is_positive;
    695 		for (let i = 0z; i < len(charset); i += 1) match (charset[i]) {
    696 		case let lit: charset_lit_item =>
    697 			if (r == lit) {
    698 				// Succeeded if positive match
    699 				// Failed if negative match
    700 				matched = cs.is_positive;
    701 				break;
    702 			};
    703 		case let range: charset_range_item =>
    704 			const r_b = r: u32;
    705 
    706 			if (r_b >= range.0 && r_b <= range.1) {
    707 				// Succeeded if positive match
    708 				// Failed if negative match
    709 				matched = cs.is_positive;
    710 				break;
    711 			};
    712 		case let class_item: charset_class_item =>
    713 			const classfn = class_item.1;
    714 			if (classfn(r)) {
    715 				// Succeeded if positive match
    716 				// Failed if negative match
    717 				matched = cs.is_positive;
    718 				break;
    719 			};
    720 		};
    721 		if (!matched) {
    722 			threads[i].failed = true;
    723 		};
    724 	case => abort(); // unreachable
    725 	};
    726 
    727 	threads[i].pc += 1;
    728 };
    729 
    730 // Attempts to match a regular expression against a string and returns the
    731 // either the longest leftmost match or all matches.
    732 fn search(
    733 	re: *regex,
    734 	string: str,
    735 	handle: io::handle,
    736 	need_captures: bool
    737 ) (void | []capture) = {
    738 	let threads: []thread = alloc([
    739 		thread { captures = [], ... }
    740 	])!;
    741 	if (re.n_reps > 0) {
    742 		threads[0].rep_counters = alloc([0...], re.n_reps)!;
    743 	};
    744 	defer {
    745 		for (let i = 0z; i < len(threads); i += 1) {
    746 			free(threads[i].captures);
    747 			free(threads[i].rep_counters);
    748 		};
    749 		free(threads);
    750 	};
    751 
    752 	let str_idx = 0z;
    753 	let first_match_idx: (void | size) = void;
    754 	let str_bytesize = 0z;
    755 	let last_bytesize = 0z;
    756 
    757 	const scan = bufio::newscanner(handle);
    758 	defer bufio::finish(&scan);
    759 	for (true) {
    760 		str_bytesize += last_bytesize;
    761 
    762 		if (len(threads) == 0) {
    763 			return void;
    764 		};
    765 
    766 		let all_matched = true;
    767 		for (let i = 0z; i < len(threads); i += 1) {
    768 			if (!threads[i].matched) {
    769 				all_matched = false;
    770 				break;
    771 			};
    772 		};
    773 
    774 		if (all_matched) {
    775 			let best_len = 0z;
    776 			let best_n_captures = 0z;
    777 			let best_idx = 0z;
    778 			for (let i = 0z; i < len(threads); i += 1) {
    779 				let match_len = threads[i].root_capture.end
    780 					- threads[i].root_capture.start;
    781 				const is_better = match_len > best_len
    782 					|| match_len == best_len
    783 					&& len(threads[i].captures)
    784 					> best_n_captures;
    785 				if (is_better) {
    786 					best_len = match_len;
    787 					best_idx = i;
    788 					best_n_captures = len(threads[i].captures);
    789 				};
    790 			};
    791 
    792 			// length = number of captures (index of final group +
    793 			// 1) + root capture
    794 			let length = 1z;
    795 			for (let i = len(re.insts); i > 0; i -= 1) {
    796 				match (re.insts[i - 1]) {
    797 				case let z: inst_groupstart =>
    798 					length = z + 2;
    799 					break;
    800 				case => void;
    801 				};
    802 			};
    803 			let res: result = alloc([], length)!;
    804 			static append(res, threads[best_idx].root_capture)!;
    805 			static append(res, threads[best_idx].captures...)!;
    806 			if (length != len(res)) {
    807 				static append(res, [capture { ... }...],
    808 					length - len(res))!;
    809 			};
    810 			return res;
    811 		};
    812 
    813 		const r_or_end = bufio::scan_rune(&scan)!;
    814 		if (r_or_end is rune) {
    815 			last_bytesize = utf8::runesz(r_or_end as rune);
    816 		};
    817 
    818 		for (let i = 0z; i < len(threads); i += 1) {
    819 			const res = run_thread(i, re, string, &threads,
    820 				r_or_end, str_idx, str_bytesize);
    821 			const matchlen = threads[i].root_capture.end
    822 				- threads[i].root_capture.start;
    823 			if (res is newmatch && matchlen > 0 && !need_captures) {
    824 				return [];
    825 			};
    826 			const is_better = res is newmatch && matchlen > 0
    827 				&& (first_match_idx is void
    828 					|| threads[i].start_idx
    829 						< first_match_idx as size);
    830 			if (is_better) {
    831 				first_match_idx = threads[i].start_idx;
    832 			};
    833 		};
    834 		str_idx += 1;
    835 
    836 		// When we only want the leftmost match, delete all threads that
    837 		// start after the earliest non-zero-length matched thread
    838 		if (first_match_idx is size) {
    839 			for (let thread &.. threads) {
    840 				if (thread.start_idx > first_match_idx as size) {
    841 					thread.failed = true;
    842 				};
    843 			};
    844 		};
    845 
    846 		// Delete threads that have a PC that has already been
    847 		// encountered in previous threads. Prioritise threads that
    848 		// have an earlier start_idx, and threads that were added
    849 		// earlier.
    850 		for (let i = 0i64; i < len(threads): i64 - 1; i += 1) {
    851 			for (let j = i + 1; j < len(threads): i64; j += 1) {
    852 				const same_pc = threads[i].pc == threads[j].pc;
    853 				const none_matched = !threads[j].matched
    854 					&& !threads[i].matched;
    855 				if (same_pc && none_matched) {
    856 					if (threads[i].start_idx
    857 							<= threads[j].start_idx) {
    858 						delete_thread(j: size, &threads);
    859 						j -= 1;
    860 					} else {
    861 						delete_thread(i: size, &threads);
    862 						i -= 1;
    863 						break;
    864 					};
    865 				};
    866 			};
    867 		};
    868 
    869 		for (let i = 0z; i < len(threads); i += 1) {
    870 			if (threads[i].failed) {
    871 				delete_thread(i, &threads);
    872 				i -= 1;
    873 			};
    874 		};
    875 	};
    876 };
    877 
    878 // Returns whether or not a [[regex]] matches any part of a given string.
    879 export fn test(re: *regex, string: str) bool = {
    880 	let strm = memio::fixed(strings::toutf8(string));
    881 	return search(re, string, &strm, false) is []capture;
    882 };
    883 
    884 
    885 // Attempts to match a [[regex]] against a string and returns the longest
    886 // leftmost match as a [[result]]. The caller must free the return value with
    887 // [[result_free]].
    888 export fn find(re: *regex, string: str) result = {
    889 	let strm = memio::fixed(strings::toutf8(string));
    890 	match (search(re, string, &strm, true)) {
    891 	case let m: []capture =>
    892 		return m;
    893 	case void =>
    894 		return [];
    895 	};
    896 };
    897 
    898 // Attempts to match a [[regex]] against a string and returns all
    899 // non-overlapping matches as a slice of [[result]]s. The caller must free the
    900 // return value with [[result_freeall]].
    901 export fn findall(re: *regex, string: str) []result = {
    902 	let res: []result = [];
    903 	let str_idx = 0z, str_bytesize = 0z;
    904 	let strm = memio::fixed(strings::toutf8(string));
    905 	const str_bytes = strings::toutf8(string);
    906 	for (true) {
    907 		let substring = strings::fromutf8_unsafe(
    908 			str_bytes[str_bytesize..]);
    909 		match (search(re, substring, &strm, true)) {
    910 		case let m: []capture =>
    911 			append(res, m)!;
    912 			m[0].start += str_idx;
    913 			m[0].end += str_idx;
    914 			m[0].start_bytesize += str_bytesize;
    915 			m[0].end_bytesize += str_bytesize;
    916 			str_idx = m[0].end;
    917 			str_bytesize = m[0].end_bytesize;
    918 			if (m[0].start_bytesize == len(str_bytes)) {
    919 				// end-of-string reached
    920 				break;
    921 			};
    922 			if (m[0].start_bytesize == m[0].end_bytesize) {
    923 				// zero-length match
    924 				// forward rune and byte indices
    925 				str_idx += 1;
    926 				str_bytesize += utf8::utf8sz(
    927 					str_bytes[str_bytesize])!;
    928 			};
    929 			io::seek(&strm, str_bytesize: io::off,
    930 				io::whence::SET)!;
    931 		case void => break;
    932 		};
    933 	};
    934 	return res;
    935 };
    936 
    937 // Replaces all non-overlapping matches of a regular expression against a string
    938 // with 'targetstr'.
    939 //
    940 // A backslash followed by a single decimal number within 'targetstr' is
    941 // replaced by the capture at that index (starting at 1), or an empty string if
    942 // no such capture exists. For example, `\1` is replaced with the first capture,
    943 // `\2` with the second, etc. `\0` is substituted with the entire substring that
    944 // was matched. `\\` is replaced with a literal backslash. The caller must free
    945 // the return value.
    946 //
    947 // An error is only returned if 'targetstr' isn't formatted correctly.
    948 export fn replace(re: *regex, string: str, targetstr: str) (str | error) = {
    949 	return replacen(re, string, targetstr, types::SIZE_MAX);
    950 };
    951 
    952 // Replaces up to 'n' non-overlapping matches of a regular expression against a
    953 // string with 'targetstr', in the same manner as [[replace]]. The caller must
    954 // free the return value.
    955 export fn replacen(
    956 	re: *regex,
    957 	string: str,
    958 	targetstr: str,
    959 	n: size,
    960 ) (str | error) = {
    961 	const target = parse_replace_target(targetstr)?;
    962 	defer free(target);
    963 	// Check if n == 0 after parse_replace_target so errors are propagated
    964 	if (n == 0) {
    965 		return strings::dup(string);
    966 	};
    967 
    968 	const matches = findall(re, string);
    969 	if (len(matches) == 0) {
    970 		return strings::dup(string);
    971 	};
    972 	defer result_freeall(matches);
    973 
    974 	const bytes = strings::toutf8(string);
    975 	let buf = alloc(bytes[..matches[0][0].start_bytesize]...)!;
    976 
    977 	const n = if (len(matches) > n) n else len(matches);
    978 	for (let i = 0z; i < n; i += 1) {
    979 		for (let j = 0z; j < len(target); j += 1) {
    980 			match (target[j]) {
    981 			case let b: []u8 =>
    982 				append(buf, b...)!;
    983 			case let z: size =>
    984 				if (z >= len(matches[i])) yield;
    985 				const b = strings::toutf8(matches[i][z].content);
    986 				append(buf, b...)!;
    987 			};
    988 		};
    989 		const start = matches[i][0].end_bytesize;
    990 		const end = if (i == n - 1) len(bytes)
    991 			else matches[i + 1][0].start_bytesize;
    992 		append(buf, bytes[start..end]...)!;
    993 	};
    994 
    995 	return strings::fromutf8(buf)!;
    996 };
    997 
    998 fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = {
    999 	const bytes = strings::toutf8(targetstr);
   1000 	let target: []([]u8 | size) = alloc([], 1)!;
   1001 	let iter = strings::iter(targetstr);
   1002 	let start = 0z, end = 0z;
   1003 	for (true) match (strings::next(&iter)) {
   1004 	case done =>
   1005 		if (start != end) {
   1006 			append(target, bytes[start..])!;
   1007 		};
   1008 		break;
   1009 	case let r: rune =>
   1010 		if (r == '\\') {
   1011 			if (start != end) {
   1012 				append(target, bytes[start..end])!;
   1013 			};
   1014 
   1015 			const r = match (strings::next(&iter)) {
   1016 			case done =>
   1017 				free(target);
   1018 				return "Trailing backslash": error;
   1019 			case let r: rune =>
   1020 				yield r;
   1021 			};
   1022 
   1023 			if (r == '\\') {
   1024 				append(target, '\\')!;
   1025 			} else if (ascii::isdigit(r)) {
   1026 				append(target, r: u32: size - 0x30)!;
   1027 			} else {
   1028 				free(target);
   1029 				return "Backslash must be followed by positive decimal number or a backslash": error;
   1030 			};
   1031 
   1032 			end += 2;
   1033 			start = end;
   1034 		} else {
   1035 			end += utf8::runesz(r);
   1036 		};
   1037 	};
   1038 
   1039 	return target;
   1040 };
   1041 
   1042 // Replaces all non-overlapping matches of a regular expression against a string
   1043 // with 'targetstr'. 'targetstr' is isn't interpreted in any special way; all
   1044 // backslashes are treated literally. The caller must free the return value.
   1045 export fn rawreplace(re: *regex, string: str, targetstr: str) str = {
   1046 	return rawreplacen(re, string, targetstr, types::SIZE_MAX);
   1047 };
   1048 
   1049 // Replaces up to 'n' non-overlapping matches of a regular expression against a
   1050 // string with 'targetstr', in the same manner as [[rawreplace]]. The caller
   1051 // must free the return value.
   1052 export fn rawreplacen(re: *regex, string: str, targetstr: str, n: size) str = {
   1053 	if (n == 0) {
   1054 		return strings::dup(string);
   1055 	};
   1056 
   1057 	const matches = findall(re, string);
   1058 	if (len(matches) == 0) {
   1059 		return strings::dup(string);
   1060 	};
   1061 	defer result_freeall(matches);
   1062 
   1063 	const target = strings::toutf8(targetstr);
   1064 	const bytes = strings::toutf8(string);
   1065 	let buf: []u8 = [];
   1066 
   1067 	append(buf, bytes[..matches[0][0].start_bytesize]...)!;
   1068 	const n = if (len(matches) > n) n else len(matches);
   1069 	for (let i = 1z; i < n; i += 1) {
   1070 		append(buf, target...)!;
   1071 		const start = matches[i - 1][0].end_bytesize;
   1072 		const end = matches[i][0].start_bytesize;
   1073 		append(buf, bytes[start..end]...)!;
   1074 	};
   1075 	append(buf, target...)!;
   1076 	append(buf, bytes[matches[n - 1][0].end_bytesize..]...)!;
   1077 
   1078 	return strings::fromutf8(buf)!;
   1079 };
   1080 
   1081 // Frees a [[result]].
   1082 export fn result_free(s: result) void = {
   1083 	free(s);
   1084 };
   1085 
   1086 // Frees a slice of [[result]]s.
   1087 export fn result_freeall(s: []result) void = {
   1088 	for (let res .. s) {
   1089 		result_free(res);
   1090 	};
   1091 	free(s);
   1092 };
   1093 
   1094 // Converts an [[error]] into a user-friendly string.
   1095 export fn strerror(err: error) str = err;
	hare [hare] The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE