regex.ha (26616B)
1 // SPDX-License-Identifier: MPL-2.0 2 // (c) Hare authors <https://harelang.org> 3 4 use ascii; 5 use bufio; 6 use encoding::utf8; 7 use io; 8 use memio; 9 use strconv; 10 use strings; 11 use types; 12 13 // An error string describing a compilation error. 14 export type error = !str; 15 16 export type inst_lit = rune, 17 inst_charset = struct { idx: size, is_positive: bool }, 18 inst_any = void, 19 inst_split = size, 20 inst_jump = size, 21 inst_skip = void, 22 inst_match = bool, 23 inst_groupstart = void, 24 inst_groupend = void, 25 inst_repeat = struct { 26 id: size, 27 origin: size, 28 min: (void | size), 29 max: (void | size), 30 }; 31 32 export type inst = (inst_lit | inst_any | inst_split | inst_jump | 33 inst_skip | inst_match | inst_charset | 34 inst_groupstart | inst_groupend | 35 inst_repeat); 36 37 // The resulting match of a [[regex]] applied to a string. 38 // 39 // The first [[capture]] corresponds to the implicit zeroth capture group, 40 // i.e. the whole expression. 41 // 42 // The rest of the [[capture]]s correspond to the rest of the capture groups, 43 // i.e. the sub-expressions. 44 export type result = []capture; 45 46 // A (sub)match corresponding to a regular expression's capture group. 47 export type capture = struct { 48 content: str, 49 start: size, 50 start_bytesize: size, 51 end: size, 52 end_bytesize: size 53 }; 54 55 type thread = struct { 56 pc: size, 57 start_idx: size, 58 start_bytesize: size, 59 root_capture: capture, 60 captures: []capture, 61 curr_capture: capture, 62 curr_capture_inited: bool, 63 rep_counters: []size, 64 matched: bool, 65 failed: bool, 66 }; 67 68 type newmatch = void; 69 70 export type charclass = enum { 71 ALNUM, ALPHA, BLANK, CNTRL, DIGIT, GRAPH, LOWER, PRINT, PUNCT, SPACE, 72 UPPER, XDIGIT, 73 }; 74 export type charset = [](charset_lit_item | charset_range_item | 75 charset_class_item), 76 charset_lit_item = rune, 77 charset_range_item = (u32, u32), 78 charset_class_item = *fn(c: rune) bool; 79 80 const charclass_map: [](str, *fn(c: rune) bool) = [ 81 (":alnum:]", &ascii::isalnum), 82 (":alpha:]", &ascii::isalpha), 83 (":blank:]", &ascii::isblank), 84 (":cntrl:]", &ascii::iscntrl), 85 (":digit:]", &ascii::isdigit), 86 (":graph:]", &ascii::isgraph), 87 (":lower:]", &ascii::islower), 88 (":print:]", &ascii::isprint), 89 (":punct:]", &ascii::ispunct), 90 (":space:]", &ascii::isspace), 91 (":upper:]", &ascii::isupper), 92 (":xdigit:]", &ascii::isxdigit), 93 ]; 94 95 export type regex = struct { 96 insts: []inst, 97 charsets: []charset, 98 n_reps: size, 99 }; 100 101 // Frees resources associated with a [[regex]]. 102 export fn finish(re: *regex) void = { 103 free(re.insts); 104 for (let charset .. re.charsets) { 105 free(charset); 106 }; 107 free(re.charsets); 108 }; 109 110 fn find_last_groupstart(insts: *[]inst) (size | error) = { 111 for (let i = len(insts); i > 0; i -= 1) { 112 if (insts[i - 1] is inst_groupstart) { 113 return i - 1; 114 }; 115 }; 116 return `Unmatched ')'`: error; 117 }; 118 119 fn handle_bracket( 120 insts: *[]inst, 121 r: rune, 122 r_idx: *size, 123 bracket_idx: *int, 124 iter: *strings::iterator, 125 charsets: *[]charset, 126 skip_charclass_rest: *bool, 127 is_charset_positive: *bool, 128 in_bracket: *bool 129 ) (void | error) = { 130 const peek1 = strings::next(iter); 131 const peek2 = strings::next(iter); 132 const peek3 = strings::next(iter); 133 if (!(peek1 is done)) { 134 strings::prev(iter); 135 }; 136 if (!(peek2 is done)) { 137 strings::prev(iter); 138 }; 139 if (!(peek3 is done)) { 140 strings::prev(iter); 141 }; 142 143 if (*bracket_idx == -1) { 144 append(charsets, []); 145 }; 146 *bracket_idx += 1; 147 148 if (*skip_charclass_rest) { 149 if (r == ']') { 150 *skip_charclass_rest = false; 151 }; 152 *r_idx += 1; 153 return; 154 }; 155 156 const is_range = peek1 is rune && peek1 as rune == '-' 157 && !(peek2 is done) && !(peek3 is done) 158 && !(peek2 as rune == ']'); 159 const range_end = peek2; 160 const is_first_char = *bracket_idx == 0 || *bracket_idx == 1 161 && !*is_charset_positive; 162 163 if (r == '\\') { 164 if (peek1 is done) { 165 return `Trailing backslash '\'`: error; 166 } else { 167 append(charsets[len(charsets) - 1], 168 peek1: charset_lit_item); 169 strings::next(iter); 170 *r_idx += 1; 171 }; 172 } else if (r == ']' && !is_first_char) { 173 const newinst = inst_charset { 174 idx = len(charsets) - 1, 175 is_positive = *is_charset_positive, 176 }; 177 append(insts, newinst); 178 *in_bracket = false; 179 *bracket_idx = -1; 180 *is_charset_positive = true; 181 } else if (r == '^' && *bracket_idx == 0) { 182 *is_charset_positive = false; 183 } else if (r == '[' && !(peek1 is done) 184 && peek1 as rune == ':') { 185 const rest = strings::iterstr(iter); 186 const n_cc = len(charclass_map); 187 for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) { 188 if (strings::hasprefix(rest, charclass_map[cc_idx].0)) { 189 append(charsets[len(charsets) - 1], 190 charclass_map[cc_idx].1); 191 *skip_charclass_rest = true; 192 break; 193 }; 194 }; 195 if (!*skip_charclass_rest) { 196 return `No character class after '[:'`: error; 197 }; 198 } else if (is_range) { 199 const start_b = r: u32; 200 const end_b = range_end as rune: u32; 201 202 if (end_b < start_b) { 203 return `Descending bracket expression range '[z-a]'`: error; 204 }; 205 206 append(charsets[len(charsets) - 1], 207 (start_b, end_b): charset_range_item); 208 strings::next(iter); 209 strings::next(iter); 210 *r_idx += 2; 211 } else { 212 append(charsets[len(charsets) - 1], 213 r: charset_lit_item); 214 }; 215 216 *r_idx += 1; 217 }; 218 219 // Compiles a regular expression string into a [[regex]]. 220 export fn compile(expr: str) (regex | error) = { 221 let insts: []inst = []; 222 let charsets: []charset = []; 223 let iter = strings::iter(expr); 224 let r_idx = 0z; 225 let anchored = false; 226 let jump_idxs: []size = []; 227 let in_bracket = false; 228 let skip_charclass_rest = false; 229 let bracket_idx = -1; 230 let is_charset_positive = true; 231 let n_reps = 0z; 232 let n_groupstarts = 0; 233 234 for (true) { 235 const next = strings::next(&iter); 236 237 if (r_idx == 0 && next is rune && next: rune != '^') { 238 append(insts, inst_skip); 239 }; 240 241 if (in_bracket) { 242 if (next is done) { 243 return `Unmatched '['`: error; 244 }; 245 const r = next: rune; 246 handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter, 247 &charsets, &skip_charclass_rest, 248 &is_charset_positive, 249 &in_bracket)?; 250 continue; 251 }; 252 253 const r = match (next) { 254 case done => 255 if (n_groupstarts > 0) { 256 return `Unmatched '('`: error; 257 }; 258 break; 259 case let r: rune => yield r; 260 }; 261 switch (r) { 262 case '\\' => 263 const peek1 = strings::next(&iter); 264 if (peek1 is done) { 265 return `Trailing backslash '\'`: error; 266 } else { 267 append(insts, (peek1 as rune): inst_lit); 268 r_idx += 1; 269 }; 270 case '^' => 271 if (r_idx != 0) { 272 return `Anchor '^' not at start`: error; 273 }; 274 case '$' => 275 if (r_idx != len(expr) - 1) { 276 return `Anchor '$' not at end`: error; 277 }; 278 anchored = true; 279 case '[' => 280 in_bracket = true; 281 case ']' => 282 append(insts, r: inst_lit); 283 case '(' => 284 if (n_groupstarts > 0) { 285 return `Nested capture groups are unsupported`: error; 286 }; 287 append(insts, inst_groupstart); 288 n_groupstarts += 1; 289 case ')' => 290 if (n_groupstarts == 0) { 291 return `Unmatched ')'`: error; 292 }; 293 n_groupstarts -= 1; 294 append(insts, inst_groupend); 295 for (let jump_idx .. jump_idxs) { 296 assert(insts[jump_idx] is inst_jump); 297 insts[jump_idx] = (len(insts) - 1): inst_jump; 298 }; 299 jump_idxs = []; 300 case '|' => 301 append(insts, types::SIZE_MAX: inst_jump); 302 const origin = match (find_last_groupstart(&insts)) { 303 case error => 304 yield 0z; 305 case let sz: size => 306 yield sz + 1; 307 }; 308 const newinst = (len(insts) + 1): inst_split; 309 // add split after last jump (if any) or at origin 310 const split_idx = if (len(jump_idxs) > 0) 311 jump_idxs[len(jump_idxs) - 1] + 1 else origin; 312 insert(insts[split_idx], newinst); 313 append(jump_idxs, len(insts) - 1); 314 case '{' => 315 let origin = len(insts) - 1; 316 if (insts[origin] is inst_groupend) { 317 origin = find_last_groupstart(&insts)?; 318 }; 319 const rest = strings::iterstr(&iter); 320 const rep_parts = parse_repetition(rest)?; 321 const can_skip = rep_parts.0 == 0; 322 const min = if (rep_parts.0 == 0) { 323 yield 1z; 324 } else { 325 yield rep_parts.0; 326 }; 327 if (can_skip) { 328 insert(insts[origin], 329 len(insts) + 2: inst_split); 330 origin += 1; 331 }; 332 const newinst = inst_repeat { 333 id = n_reps, 334 origin = origin, 335 min = min, 336 max = rep_parts.1, 337 }; 338 for (let i = 0z; i <= rep_parts.2; i += 1) { 339 strings::next(&iter); 340 r_idx += 1; 341 }; 342 append(insts, newinst); 343 n_reps += 1; 344 case '?' => 345 if (r_idx == 0 || len(insts) == 0) { 346 return `Unused '?'`: error; 347 }; 348 let term_start_idx = len(insts) - 1; 349 match (insts[term_start_idx]) { 350 case (inst_lit | inst_charset | inst_any) => void; 351 case inst_groupend => 352 term_start_idx = find_last_groupstart(&insts)?; 353 case inst_groupstart => 354 return `Unused '?'`: error; 355 case => 356 return `Misused '?'`: error; 357 }; 358 const after_idx = len(insts) + 1; 359 insert(insts[term_start_idx], after_idx: inst_split); 360 case '*' => 361 if (r_idx == 0 || len(insts) == 0) { 362 return `Unused '*'`: error; 363 }; 364 const new_inst_offset = 1z; 365 const jump_idx = len(insts) + new_inst_offset; 366 const after_idx = jump_idx + 1z; 367 let term_start_idx = len(insts) - 1z; 368 match (insts[term_start_idx]) { 369 case (inst_lit | inst_charset | inst_any) => void; 370 case inst_groupend => 371 term_start_idx = find_last_groupstart(&insts)?; 372 case inst_groupstart => 373 return `Unused '*'`: error; 374 case => 375 return `Misused '*'`: error; 376 }; 377 const split_idx = term_start_idx; 378 term_start_idx += new_inst_offset; 379 insert(insts[split_idx], after_idx: inst_split); 380 append(insts, split_idx: inst_jump); 381 case '+' => 382 if (r_idx == 0 || len(insts) == 0) { 383 return `Unused '+'`: error; 384 }; 385 let term_start_idx = len(insts) - 1; 386 match (insts[term_start_idx]) { 387 case (inst_lit | inst_charset | inst_any) => void; 388 case inst_groupend => 389 term_start_idx = find_last_groupstart(&insts)?; 390 case inst_groupstart => 391 return `Unused '+'`: error; 392 case => 393 return `Misused '+'`: error; 394 }; 395 append(insts, term_start_idx: inst_split); 396 case '.' => 397 append(insts, inst_any); 398 case => 399 append(insts, r: inst_lit); 400 }; 401 r_idx += 1; 402 }; 403 404 // handle whole expression alternation 405 for (let jump_idx .. jump_idxs) { 406 assert(insts[jump_idx] is inst_jump); 407 insts[jump_idx] = len(insts): inst_jump; 408 }; 409 jump_idxs = []; 410 411 append(insts, anchored: inst_match); 412 413 return regex { 414 insts = insts, 415 charsets = charsets, 416 n_reps = n_reps, 417 }; 418 }; 419 420 fn parse_repetition( 421 s: str 422 ) (((void | size), (void | size), size) | error) = { 423 const first_comma = strings::index(s, ","); 424 const first_endbrace = strings::index(s, "}"); 425 if (first_endbrace is void) { 426 return `Repetition expression syntax error '{n}'`: error; 427 }; 428 const first_endbrace = first_endbrace as size; 429 430 let min_str = ""; 431 let max_str = ""; 432 let is_single_arg = false; 433 if (first_comma is void || first_endbrace < first_comma as size) { 434 const cut = strings::cut(s, "}"); 435 min_str = cut.0; 436 max_str = cut.0; 437 is_single_arg = true; 438 } else { 439 const cut = strings::cut(s, ","); 440 min_str = cut.0; 441 max_str = strings::cut(cut.1, "}").0; 442 }; 443 444 let min: (void | size) = void; 445 let max: (void | size) = void; 446 447 if (len(min_str) > 0) { 448 min = match (strconv::stoi(min_str)) { 449 case let res: int => 450 yield if (res < 0) { 451 return `Negative repetition count '{-n}'`: error; 452 } else { 453 yield res: size; 454 }; 455 case => return `Repetition expression syntax error '{n}'`: error; 456 }; 457 } else { 458 min = 0; 459 }; 460 461 if (len(max_str) > 0) { 462 max = match (strconv::stoi(max_str)) { 463 case let res: int => 464 yield if (res < 0) { 465 return `Negative repetition count '{-n}'`: error; 466 } else { 467 yield res: size; 468 }; 469 case => return `Repetition expression syntax error '{n}'`: error; 470 }; 471 }; 472 473 const rep_len = if (is_single_arg) { 474 yield len(min_str); 475 } else { 476 yield len(min_str) + 1 + len(max_str); 477 }; 478 return (min, max, rep_len); 479 }; 480 481 fn delete_thread(i: size, threads: *[]thread) void = { 482 free(threads[i].captures); 483 free(threads[i].rep_counters); 484 delete(threads[i]); 485 }; 486 487 fn is_consuming_inst(a: inst) bool = { 488 return a is (inst_lit | inst_any | inst_charset); 489 }; 490 491 fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = { 492 // Do not add this thread if there is already another thread with 493 // the same PC 494 for (let thread &.. *threads) { 495 if (thread.pc == new_pc && !thread.matched 496 && thread.start_idx 497 < threads[parent_idx].start_idx) { 498 return; 499 }; 500 }; 501 502 append(threads, thread { 503 pc = new_pc, 504 start_idx = threads[parent_idx].start_idx, 505 start_bytesize = threads[parent_idx].start_bytesize, 506 curr_capture = threads[parent_idx].curr_capture, 507 curr_capture_inited = threads[parent_idx].curr_capture_inited, 508 matched = threads[parent_idx].matched, 509 failed = threads[parent_idx].failed, 510 captures = alloc(threads[parent_idx].captures...), 511 rep_counters = alloc(threads[parent_idx].rep_counters...), 512 ... 513 }); 514 }; 515 516 fn run_thread( 517 i: size, 518 re: *regex, 519 string: str, 520 threads: *[]thread, 521 r_or_end: (rune | io::EOF), 522 str_idx: size, 523 str_bytesize: size 524 ) (void | newmatch) = { 525 const str_bytes = strings::toutf8(string); 526 if (threads[i].matched) { 527 return; 528 }; 529 for (!is_consuming_inst(re.insts[threads[i].pc])) { 530 match (re.insts[threads[i].pc]) { 531 case inst_lit => abort(); 532 case inst_any => abort(); 533 case inst_split => 534 const new_pc = re.insts[threads[i].pc]: inst_split: size; 535 add_thread(threads, i, new_pc); 536 threads[i].pc += 1; 537 case inst_jump => 538 threads[i].pc = re.insts[threads[i].pc]: inst_jump: size; 539 case inst_skip => 540 const new_pc = threads[i].pc + 1; 541 threads[i].start_idx = str_idx; 542 threads[i].start_bytesize = str_bytesize; 543 add_thread(threads, i, new_pc); 544 break; 545 case let anchored: inst_match => 546 // Do not match if we need an end-anchored match, but we 547 // have not exhausted our string 548 if (anchored && !(r_or_end is io::EOF)) { 549 threads[i].failed = true; 550 return; 551 }; 552 const content = strings::fromutf8_unsafe(str_bytes[ 553 threads[i].start_bytesize..str_bytesize]); 554 threads[i].root_capture = capture { 555 start = threads[i].start_idx, 556 start_bytesize = threads[i].start_bytesize, 557 end = str_idx, 558 end_bytesize = str_bytesize, 559 content = content, 560 }; 561 threads[i].matched = true; 562 return newmatch; 563 case inst_groupstart => 564 assert(!threads[i].curr_capture_inited, "Found nested capture groups in expression, which are not supported"); 565 threads[i].curr_capture.start = str_idx; 566 threads[i].curr_capture.start_bytesize = str_bytesize; 567 threads[i].curr_capture_inited = true; 568 threads[i].pc += 1; 569 case inst_groupend => 570 assert(threads[i].curr_capture_inited, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`); 571 threads[i].curr_capture.end = str_idx; 572 threads[i].curr_capture.end_bytesize = str_bytesize; 573 threads[i].curr_capture.content = 574 strings::fromutf8_unsafe(str_bytes[ 575 threads[i].curr_capture.start_bytesize.. 576 threads[i].curr_capture.end_bytesize]); 577 append(threads[i].captures, threads[i].curr_capture); 578 threads[i].curr_capture = capture { ... }; 579 threads[i].curr_capture_inited = false; 580 threads[i].pc += 1; 581 case let ir: inst_repeat => 582 assert(ir.id < len(threads[i].rep_counters)); 583 threads[i].rep_counters[ir.id] += 1; 584 if (ir.max is size 585 && threads[i].rep_counters[ir.id] 586 > ir.max as size) { 587 threads[i].failed = true; 588 return; 589 }; 590 const new_pc = threads[i].pc + 1; 591 threads[i].pc = ir.origin; 592 if (ir.min is void 593 || threads[i].rep_counters[ir.id] 594 >= ir.min as size) { 595 add_thread(threads, i, new_pc); 596 }; 597 }; 598 }; 599 600 // From now on, we're only matching consuming instructions, and these 601 // can't do anything without another rune. 602 if (r_or_end is io::EOF) { 603 threads[i].failed = true; 604 return; 605 }; 606 607 const r = r_or_end as rune; 608 609 match (re.insts[threads[i].pc]) { 610 case inst_skip => return; 611 case let lit: inst_lit => 612 if (r != lit) { 613 threads[i].failed = true; 614 }; 615 case inst_any => void; 616 case let cs: inst_charset => 617 const charset = re.charsets[cs.idx]; 618 // Disprove the match if we're looking for a negative match 619 // Prove the match if we're looking for a positive match 620 let matched = !cs.is_positive; 621 for (let i = 0z; i < len(charset); i += 1) match (charset[i]) { 622 case let lit: charset_lit_item => 623 if (r == lit) { 624 // Succeeded if positive match 625 // Failed if negative match 626 matched = cs.is_positive; 627 break; 628 }; 629 case let range: charset_range_item => 630 const r_b = r: u32; 631 632 if (r_b >= range.0 && r_b <= range.1) { 633 // Succeeded if positive match 634 // Failed if negative match 635 matched = cs.is_positive; 636 break; 637 }; 638 case let classfn: charset_class_item => 639 if (classfn(r)) { 640 // Succeeded if positive match 641 // Failed if negative match 642 matched = cs.is_positive; 643 break; 644 }; 645 }; 646 if (!matched) { 647 threads[i].failed = true; 648 }; 649 }; 650 651 threads[i].pc += 1; 652 }; 653 654 // Attempts to match a regular expression against a string and returns the 655 // either the longest leftmost match or all matches. 656 fn search( 657 re: *regex, 658 string: str, 659 handle: io::handle, 660 need_captures: bool 661 ) (void | []capture) = { 662 let threads: []thread = alloc([ 663 thread { captures = [], ... } 664 ]); 665 if (re.n_reps > 0) { 666 threads[0].rep_counters = alloc([0...], re.n_reps); 667 }; 668 defer { 669 for (let i = 0z; i < len(threads); i += 1) { 670 free(threads[i].captures); 671 free(threads[i].rep_counters); 672 }; 673 free(threads); 674 }; 675 676 let str_idx = 0z; 677 let first_match_idx: (void | size) = void; 678 let str_bytesize = 0z; 679 let last_bytesize = 0z; 680 681 for (true) { 682 str_bytesize += last_bytesize; 683 684 if (len(threads) == 0) { 685 return void; 686 }; 687 688 let all_matched = true; 689 for (let i = 0z; i < len(threads); i += 1) { 690 if (!threads[i].matched) { 691 all_matched = false; 692 break; 693 }; 694 }; 695 696 if (all_matched) { 697 let best_len = 0z; 698 let best_n_captures = 0z; 699 let best_idx = 0z; 700 for (let i = 0z; i < len(threads); i += 1) { 701 let match_len = threads[i].root_capture.end 702 - threads[i].root_capture.start; 703 const is_better = match_len > best_len 704 || match_len == best_len 705 && len(threads[i].captures) 706 > best_n_captures; 707 if (is_better) { 708 best_len = match_len; 709 best_idx = i; 710 best_n_captures = len(threads[i].captures); 711 }; 712 }; 713 let res: []capture = alloc([], 714 len(threads[best_idx].captures) + 1); 715 append(res, threads[best_idx].root_capture); 716 append(res, threads[best_idx].captures...); 717 return res; 718 }; 719 720 const r_or_end = bufio::read_rune(handle)!; 721 if (r_or_end is rune) { 722 last_bytesize = utf8::runesz(r_or_end as rune); 723 }; 724 725 for (let i = 0z; i < len(threads); i += 1) { 726 const res = run_thread(i, re, string, &threads, 727 r_or_end, str_idx, str_bytesize); 728 const matchlen = threads[i].root_capture.end 729 - threads[i].root_capture.start; 730 if (res is newmatch && matchlen > 0 && !need_captures) { 731 return []; 732 }; 733 const is_better = res is newmatch && matchlen > 0 734 && (first_match_idx is void 735 || threads[i].start_idx 736 < first_match_idx as size); 737 if (is_better) { 738 first_match_idx = threads[i].start_idx; 739 }; 740 }; 741 str_idx += 1; 742 743 // When we only want the leftmost match, delete all threads that 744 // start after the earliest non-zero-length matched thread 745 if (first_match_idx is size) { 746 for (let thread &.. threads) { 747 if (thread.start_idx > first_match_idx as size) { 748 thread.failed = true; 749 }; 750 }; 751 }; 752 753 // Delete threads that have a PC that has already been 754 // encountered in previous threads. Prioritise threads that 755 // have an earlier start_idx, and threads that were added 756 // earlier. 757 for (let i = 0i64; i < len(threads): i64 - 1; i += 1) { 758 for (let j = i + 1; j < len(threads): i64; j += 1) { 759 const same_pc = threads[i].pc == threads[j].pc; 760 const none_matched = !threads[j].matched 761 && !threads[i].matched; 762 if (same_pc && none_matched) { 763 if (threads[i].start_idx 764 <= threads[j].start_idx) { 765 delete_thread(j: size, &threads); 766 j -= 1; 767 } else { 768 delete_thread(i: size, &threads); 769 i -= 1; 770 break; 771 }; 772 }; 773 }; 774 }; 775 776 for (let i = 0z; i < len(threads); i += 1) { 777 if (threads[i].failed) { 778 delete_thread(i, &threads); 779 i -= 1; 780 }; 781 }; 782 }; 783 }; 784 785 // Returns whether or not a [[regex]] matches any part of a given string. 786 export fn test(re: *regex, string: str) bool = { 787 let strm = memio::fixed(strings::toutf8(string)); 788 return search(re, string, &strm, false) is []capture; 789 }; 790 791 792 // Attempts to match a [[regex]] against a string and returns the longest 793 // leftmost match as a [[result]]. The caller must free the return value with 794 // [[result_free]]. 795 export fn find(re: *regex, string: str) result = { 796 let strm = memio::fixed(strings::toutf8(string)); 797 match (search(re, string, &strm, true)) { 798 case let m: []capture => 799 return m; 800 case void => 801 return []; 802 }; 803 }; 804 805 // Attempts to match a [[regex]] against a string and returns all 806 // non-overlapping matches as a slice of [[result]]s. The caller must free the 807 // return value with [[result_freeall]]. 808 export fn findall(re: *regex, string: str) []result = { 809 let res: []result = []; 810 let str_idx = 0z, str_bytesize = 0z; 811 let strm = memio::fixed(strings::toutf8(string)); 812 const str_bytes = strings::toutf8(string); 813 for (true) { 814 let substring = strings::fromutf8_unsafe( 815 str_bytes[str_bytesize..]); 816 match (search(re, substring, &strm, true)) { 817 case let m: []capture => 818 append(res, m); 819 m[0].start += str_idx; 820 m[0].end += str_idx; 821 m[0].start_bytesize += str_bytesize; 822 m[0].end_bytesize += str_bytesize; 823 str_idx = m[0].end; 824 str_bytesize = m[0].end_bytesize; 825 if (m[0].start_bytesize == len(str_bytes)) { 826 // end-of-string reached 827 break; 828 }; 829 if (m[0].start_bytesize == m[0].end_bytesize) { 830 // zero-length match 831 // forward rune and byte indices 832 str_idx += 1; 833 str_bytesize += encoding::utf8::utf8sz( 834 str_bytes[str_bytesize])!; 835 }; 836 io::seek(&strm, str_bytesize: io::off, 837 io::whence::SET)!; 838 case void => break; 839 }; 840 }; 841 return res; 842 }; 843 844 // Replaces all non-overlapping matches of a regular expression against a string 845 // with 'targetstr'. 846 // 847 // A backslash followed by a single decimal number within 'targetstr' is 848 // replaced by the capture at that index (starting at 1), or an empty string if 849 // no such capture exists. For example, `\1` is replaced with the first capture, 850 // `\2` with the second, etc. `\0` is substituted with the entire substring that 851 // was matched. `\\` is replaced with a literal backslash. The caller must free 852 // the return value. 853 // 854 // An error is only returned if 'targetstr' isn't formatted correctly. 855 export fn replace(re: *regex, string: str, targetstr: str) (str | error) = { 856 return replacen(re, string, targetstr, types::SIZE_MAX); 857 }; 858 859 // Replaces up to 'n' non-overlapping matches of a regular expression against a 860 // string with 'targetstr', in the same manner as [[replace]]. The caller must 861 // free the return value. 862 export fn replacen( 863 re: *regex, 864 string: str, 865 targetstr: str, 866 n: size, 867 ) (str | error) = { 868 const target = parse_replace_target(targetstr)?; 869 defer free(target); 870 // Check if n == 0 after parse_replace_target so errors are propagated 871 if (n == 0) { 872 return strings::dup(string); 873 }; 874 875 const matches = findall(re, string); 876 if (len(matches) == 0) { 877 return strings::dup(string); 878 }; 879 defer result_freeall(matches); 880 881 const bytes = strings::toutf8(string); 882 let buf = alloc(bytes[..matches[0][0].start_bytesize]...); 883 884 const n = if (len(matches) > n) n else len(matches); 885 for (let i = 0z; i < n; i += 1) { 886 for (let j = 0z; j < len(target); j += 1) { 887 match (target[j]) { 888 case let b: []u8 => 889 append(buf, b...); 890 case let z: size => 891 if (z >= len(matches[i])) yield; 892 const b = strings::toutf8(matches[i][z].content); 893 append(buf, b...); 894 }; 895 }; 896 const start = matches[i][0].end_bytesize; 897 const end = if (i == n - 1) len(bytes) 898 else matches[i + 1][0].start_bytesize; 899 append(buf, bytes[start..end]...); 900 }; 901 902 return strings::fromutf8(buf)!; 903 }; 904 905 fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = { 906 const bytes = strings::toutf8(targetstr); 907 let target: []([]u8 | size) = alloc([], 1); 908 let iter = strings::iter(targetstr); 909 let start = 0z, end = 0z; 910 for (true) match (strings::next(&iter)) { 911 case done => 912 if (start != end) { 913 append(target, bytes[start..]); 914 }; 915 break; 916 case let r: rune => 917 if (r == '\\') { 918 if (start != end) { 919 append(target, bytes[start..end]); 920 }; 921 922 const r = match (strings::next(&iter)) { 923 case done => 924 return "Trailing backslash": error; 925 case let r: rune => 926 yield r; 927 }; 928 929 if (r == '\\') { 930 append(target, '\\'); 931 } else if (ascii::isdigit(r)) { 932 append(target, r: u32: size - 0x30); 933 } else { 934 return "Backslash must be followed by positive decimal number or a backslash": error; 935 }; 936 937 end += 2; 938 start = end; 939 } else { 940 end += utf8::runesz(r); 941 }; 942 }; 943 944 return target; 945 }; 946 947 // Replaces all non-overlapping matches of a regular expression against a string 948 // with 'targetstr'. 'targetstr' is isn't interpreted in any special way; all 949 // backslashes are treated literally. The caller must free the return value. 950 export fn rawreplace(re: *regex, string: str, targetstr: str) str = { 951 return rawreplacen(re, string, targetstr, types::SIZE_MAX); 952 }; 953 954 // Replaces up to 'n' non-overlapping matches of a regular expression against a 955 // string with 'targetstr', in the same manner as [[rawreplace]]. The caller 956 // must free the return value. 957 export fn rawreplacen(re: *regex, string: str, targetstr: str, n: size) str = { 958 if (n == 0) { 959 return strings::dup(string); 960 }; 961 962 const matches = findall(re, string); 963 if (len(matches) == 0) { 964 return strings::dup(string); 965 }; 966 defer result_freeall(matches); 967 968 const target = strings::toutf8(targetstr); 969 const bytes = strings::toutf8(string); 970 let buf: []u8 = []; 971 972 append(buf, bytes[..matches[0][0].start_bytesize]...); 973 const n = if (len(matches) > n) n else len(matches); 974 for (let i = 1z; i < n; i += 1) { 975 append(buf, target...); 976 const start = matches[i - 1][0].end_bytesize; 977 const end = matches[i][0].start_bytesize; 978 append(buf, bytes[start..end]...); 979 }; 980 append(buf, target...); 981 append(buf, bytes[matches[n - 1][0].end_bytesize..]...); 982 983 return strings::fromutf8(buf)!; 984 }; 985 986 // Frees a [[result]]. 987 export fn result_free(s: result) void = { 988 free(s); 989 }; 990 991 // Frees a slice of [[result]]s. 992 export fn result_freeall(s: []result) void = { 993 for (let res .. s) { 994 result_free(res); 995 }; 996 free(s); 997 }; 998 999 // Converts an [[error]] into a user-friendly string. 1000 export fn strerror(err: error) str = err;