regex.ha (29397B)
1 // SPDX-License-Identifier: MPL-2.0 2 // (c) Hare authors <https://harelang.org> 3 4 use ascii; 5 use bufio; 6 use encoding::utf8; 7 use io; 8 use memio; 9 use strconv; 10 use strings; 11 use types; 12 13 // An error string describing a compilation error. 14 export type error = !str; 15 16 export type inst_lit = rune, 17 inst_charset = struct { idx: size, is_positive: bool }, 18 inst_any = void, 19 inst_split = size, 20 inst_jump = size, 21 inst_skip = void, 22 inst_match = bool, 23 inst_groupstart = size, 24 inst_groupend = void, 25 inst_repeat = struct { 26 id: size, 27 origin: size, 28 min: (void | size), 29 max: (void | size), 30 }; 31 32 export type inst = (inst_lit | inst_any | inst_split | inst_jump | 33 inst_skip | inst_match | inst_charset | 34 inst_groupstart | inst_groupend | 35 inst_repeat); 36 37 // The resulting match of a [[regex]] applied to a string. 38 // 39 // The first [[capture]] corresponds to the implicit zeroth capture group, 40 // i.e. the whole expression. 41 // 42 // The rest of the [[capture]]s correspond to the rest of the capture groups, 43 // i.e. the sub-expressions. 44 export type result = []capture; 45 46 // A (sub)match corresponding to a regular expression's capture group. 47 export type capture = struct { 48 content: str, 49 start: size, 50 start_bytesize: size, 51 end: size, 52 end_bytesize: size 53 }; 54 55 type thread = struct { 56 pc: size, 57 start_idx: size, 58 start_bytesize: size, 59 root_capture: capture, 60 captures: []capture, 61 rep_counters: []size, 62 matched: bool, 63 failed: bool, 64 }; 65 66 type newmatch = void; 67 68 export type charset = [](charset_lit_item | charset_range_item | 69 charset_class_item), 70 charset_lit_item = rune, 71 charset_range_item = (u32, u32), 72 charset_class_item = (str, *fn(c: rune) bool); 73 74 const charclass_map: [](str, *fn(c: rune) bool) = [ 75 (":alnum:]", &ascii::isalnum), 76 (":alpha:]", &ascii::isalpha), 77 (":blank:]", &ascii::isblank), 78 (":cntrl:]", &ascii::iscntrl), 79 (":digit:]", &ascii::isdigit), 80 (":graph:]", &ascii::isgraph), 81 (":lower:]", &ascii::islower), 82 (":print:]", &ascii::isprint), 83 (":punct:]", &ascii::ispunct), 84 (":space:]", &ascii::isspace), 85 (":upper:]", &ascii::isupper), 86 (":xdigit:]", &ascii::isxdigit), 87 ]; 88 89 export type regex = struct { 90 insts: []inst, 91 charsets: []charset, 92 n_reps: size, 93 }; 94 95 // Frees resources associated with a [[regex]]. 96 export fn finish(re: *regex) void = { 97 free(re.insts); 98 for (let charset .. re.charsets) { 99 free(charset); 100 }; 101 free(re.charsets); 102 }; 103 104 fn find_last_groupstart(insts: []inst) (size | error) = { 105 let nested = 0u; 106 for (let i = len(insts); i > 0; i -= 1) { 107 match (insts[i - 1]) { 108 case inst_groupstart => 109 if (nested == 0) { 110 return i - 1; 111 }; 112 nested -= 1; 113 case inst_groupend => 114 nested += 1; 115 case => void; 116 }; 117 }; 118 return `Unmatched ')'`: error; 119 }; 120 121 // increments all [[inst_jump]] and [[inst_split]] instructions to account for a 122 // newly inserted instruction before the given slice 123 fn shift(sl: []inst) void = { 124 for (let inst &.. sl) { 125 match (*inst) { 126 case let z: inst_jump => 127 *inst = (z + 1): inst_jump; 128 case let z: inst_split => 129 *inst = (z + 1): inst_split; 130 case => void; 131 }; 132 }; 133 }; 134 135 fn handle_bracket( 136 insts: *[]inst, 137 r: rune, 138 r_idx: *size, 139 bracket_idx: *int, 140 iter: *strings::iterator, 141 charsets: *[]charset, 142 skip_charclass_rest: *bool, 143 is_charset_positive: *bool, 144 in_bracket: *bool 145 ) (void | error) = { 146 const peek1 = strings::next(iter); 147 const peek2 = strings::next(iter); 148 const peek3 = strings::next(iter); 149 if (!(peek1 is done)) { 150 strings::prev(iter); 151 }; 152 if (!(peek2 is done)) { 153 strings::prev(iter); 154 }; 155 if (!(peek3 is done)) { 156 strings::prev(iter); 157 }; 158 159 if (*bracket_idx == -1) { 160 append(charsets, [])!; 161 }; 162 *bracket_idx += 1; 163 164 if (*skip_charclass_rest) { 165 if (r == ']') { 166 *skip_charclass_rest = false; 167 }; 168 *r_idx += 1; 169 return; 170 }; 171 172 const is_range = peek1 is rune && peek1 as rune == '-' 173 && !(peek2 is done) && !(peek3 is done) 174 && !(peek2 as rune == ']'); 175 const range_end = peek2; 176 const is_first_char = *bracket_idx == 0 || *bracket_idx == 1 177 && !*is_charset_positive; 178 179 if (r == '\\') { 180 if (peek1 is done) { 181 return `Trailing backslash '\'`: error; 182 } else { 183 append(charsets[len(charsets) - 1], 184 peek1: charset_lit_item)!; 185 strings::next(iter); 186 *r_idx += 1; 187 }; 188 } else if (r == ']' && !is_first_char) { 189 const newinst = inst_charset { 190 idx = len(charsets) - 1, 191 is_positive = *is_charset_positive, 192 }; 193 append(insts, newinst)!; 194 *in_bracket = false; 195 *bracket_idx = -1; 196 *is_charset_positive = true; 197 } else if (r == '^' && *bracket_idx == 0) { 198 *is_charset_positive = false; 199 } else if (r == '[' && !(peek1 is done) 200 && peek1 as rune == ':') { 201 const rest = strings::iterstr(iter); 202 const n_cc = len(charclass_map); 203 for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) { 204 if (strings::hasprefix(rest, charclass_map[cc_idx].0)) { 205 append(charsets[len(charsets) - 1], 206 charclass_map[cc_idx])!; 207 *skip_charclass_rest = true; 208 break; 209 }; 210 }; 211 if (!*skip_charclass_rest) { 212 return `No character class after '[:'`: error; 213 }; 214 } else if (is_range) { 215 const start_b = r: u32; 216 const end_b = range_end as rune: u32; 217 218 if (end_b < start_b) { 219 return `Descending bracket expression range '[z-a]'`: error; 220 }; 221 222 append(charsets[len(charsets) - 1], 223 (start_b, end_b): charset_range_item)!; 224 strings::next(iter); 225 strings::next(iter); 226 *r_idx += 2; 227 } else { 228 append(charsets[len(charsets) - 1], 229 r: charset_lit_item)!; 230 }; 231 232 *r_idx += 1; 233 }; 234 235 // Compiles a regular expression string into a [[regex]]. 236 export fn compile(expr: str) (regex | error) = { 237 let insts: []inst = []; 238 let charsets: []charset = []; 239 let iter = strings::iter(expr); 240 let r_idx = 0z; 241 let jump_idxs: [][]size = []; 242 append(jump_idxs, [])!; 243 defer free(jump_idxs); 244 let in_bracket = false; 245 let skip_charclass_rest = false; 246 let bracket_idx = -1; 247 let is_charset_positive = true; 248 let was_prev_rune_pipe = false; 249 let n_reps = 0z; 250 let group_level = 0z; 251 let capture_idx = 0z; 252 253 for (true) { 254 const next = strings::next(&iter); 255 256 if (r_idx == 0 && next is rune && next: rune != '^') { 257 append(insts, inst_skip)!; 258 }; 259 260 if (in_bracket) { 261 if (next is done) { 262 return `Unmatched '['`: error; 263 }; 264 const r = next: rune; 265 handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter, 266 &charsets, &skip_charclass_rest, 267 &is_charset_positive, 268 &in_bracket)?; 269 continue; 270 }; 271 272 const r = match (next) { 273 case done => 274 if (group_level > 0) { 275 return `Unmatched '('`: error; 276 }; 277 break; 278 case let r: rune => yield r; 279 }; 280 switch (r) { 281 case '\\' => 282 const peek1 = strings::next(&iter); 283 if (peek1 is done) { 284 return `Trailing backslash '\'`: error; 285 } else { 286 append(insts, (peek1 as rune): inst_lit)!; 287 r_idx += 1; 288 }; 289 case '^' => 290 if (group_level > 0) { 291 return `Anchor '^' in capture groups is unsupported`: error; 292 }; 293 if (!(r_idx == 0 || was_prev_rune_pipe)) { 294 return `Anchor '^' not at start of whole pattern or alternation`: error; 295 }; 296 case '$' => 297 if (group_level > 0) { 298 return `Anchor '$' in capture groups is unsupported`: error; 299 }; 300 const peek1 = strings::next(&iter); 301 if (peek1 is rune) { 302 if (peek1 as rune != '|') { 303 return `Anchor '$' not at end of whole pattern or alternation`: error; 304 }; 305 strings::prev(&iter); 306 }; 307 append(insts, true: inst_match)!; 308 case '[' => 309 in_bracket = true; 310 case ']' => 311 append(insts, r: inst_lit)!; 312 case '(' => 313 append(insts, capture_idx: inst_groupstart)!; 314 group_level += 1; 315 capture_idx += 1; 316 for (len(jump_idxs) < group_level + 1) { 317 append(jump_idxs, [])!; 318 }; 319 case ')' => 320 if (group_level == 0) { 321 return `Unmatched ')'`: error; 322 }; 323 append(insts, inst_groupend)!; 324 for (let jump_idx .. jump_idxs[group_level]) { 325 assert(insts[jump_idx] is inst_jump); 326 insts[jump_idx] = (len(insts) - 1): inst_jump; 327 }; 328 delete(jump_idxs[group_level][..]); 329 group_level -= 1; 330 case '|' => 331 append(insts, types::SIZE_MAX: inst_jump)!; 332 const origin = match (find_last_groupstart(insts)) { 333 case error => 334 yield 0z; 335 case let sz: size => 336 yield sz + 1; 337 }; 338 const newinst = (len(insts) + 1): inst_split; 339 // add split after last jump (if any) or at origin 340 const split_idx = if (len(jump_idxs[group_level]) > 0) 341 jump_idxs[group_level][len(jump_idxs[group_level]) - 1] + 1 else origin; 342 insert(insts[split_idx], newinst)!; 343 shift(insts[split_idx + 1..]); 344 // our insertion of our split_idx should never interfere 345 // with an existing jump_idx 346 for (let jump_idx .. jump_idxs[group_level]) { 347 // if this assertion ends up being hit in the 348 // future, it is a sign that jump_idx should be 349 // incremented 350 assert(jump_idx < split_idx, `Found jump_idx interference. Please report this as a bug`); 351 }; 352 append(jump_idxs[group_level], len(insts) - 1)!; 353 // add skip if it's a whole-expression alternation 354 if (origin == 0) { 355 const peek1 = strings::next(&iter); 356 if (peek1 is rune) { 357 if (peek1 as rune != '^') { 358 append(insts, inst_skip)!; 359 }; 360 strings::prev(&iter); 361 }; 362 }; 363 case '{' => 364 let origin = len(insts) - 1; 365 if (insts[origin] is inst_groupend) { 366 origin = find_last_groupstart(insts[..origin])?; 367 }; 368 const rest = strings::iterstr(&iter); 369 const rep_parts = parse_repetition(rest)?; 370 const can_skip = rep_parts.0 == 0; 371 const min = if (rep_parts.0 == 0) { 372 yield 1z; 373 } else { 374 yield rep_parts.0; 375 }; 376 if (can_skip) { 377 // len(insts) - 1 is the current last instruction 378 // len(insts) is the next instruction 379 // advance to len(insts) + 1 to make space for the `inst_split` 380 // advance to len(insts) + 2 to make space for the `inst_repeat` 381 insert(insts[origin], 382 len(insts) + 2: inst_split)!; 383 shift(insts[origin + 1..]); 384 origin += 1; 385 }; 386 const newinst = inst_repeat { 387 id = n_reps, 388 origin = origin, 389 min = min, 390 max = rep_parts.1, 391 }; 392 for (let i = 0z; i <= rep_parts.2; i += 1) { 393 strings::next(&iter); 394 r_idx += 1; 395 }; 396 append(insts, newinst)!; 397 n_reps += 1; 398 case '?' => 399 if (r_idx == 0 || len(insts) == 0) { 400 return `Unused '?'`: error; 401 }; 402 let term_start_idx = len(insts) - 1; 403 match (insts[term_start_idx]) { 404 case (inst_lit | inst_charset | inst_any) => void; 405 case inst_groupend => 406 term_start_idx = find_last_groupstart( 407 insts[..term_start_idx])?; 408 case inst_groupstart => 409 return `Unused '?'`: error; 410 case => 411 return `Misused '?'`: error; 412 }; 413 const after_idx = len(insts) + 1; 414 insert(insts[term_start_idx], after_idx: inst_split)!; 415 shift(insts[term_start_idx + 1..]); 416 case '*' => 417 if (r_idx == 0 || len(insts) == 0) { 418 return `Unused '*'`: error; 419 }; 420 const new_inst_offset = 1z; 421 const jump_idx = len(insts) + new_inst_offset; 422 const after_idx = jump_idx + 1z; 423 let term_start_idx = len(insts) - 1z; 424 match (insts[term_start_idx]) { 425 case (inst_lit | inst_charset | inst_any) => void; 426 case inst_groupend => 427 term_start_idx = find_last_groupstart( 428 insts[..term_start_idx])?; 429 case inst_groupstart => 430 return `Unused '*'`: error; 431 case => 432 return `Misused '*'`: error; 433 }; 434 const split_idx = term_start_idx; 435 term_start_idx += new_inst_offset; 436 insert(insts[split_idx], after_idx: inst_split)!; 437 shift(insts[split_idx + 1..]); 438 append(insts, split_idx: inst_jump)!; 439 case '+' => 440 if (r_idx == 0 || len(insts) == 0) { 441 return `Unused '+'`: error; 442 }; 443 let term_start_idx = len(insts) - 1; 444 match (insts[term_start_idx]) { 445 case (inst_lit | inst_charset | inst_any) => void; 446 case inst_groupend => 447 term_start_idx = find_last_groupstart( 448 insts[..term_start_idx])?; 449 case inst_groupstart => 450 return `Unused '+'`: error; 451 case => 452 return `Misused '+'`: error; 453 }; 454 append(insts, term_start_idx: inst_split)!; 455 case '.' => 456 append(insts, inst_any)!; 457 case => 458 append(insts, r: inst_lit)!; 459 }; 460 was_prev_rune_pipe = (r == '|'); 461 r_idx += 1; 462 }; 463 464 // handle whole expression alternation 465 for (let jump_idx .. jump_idxs[0]) { 466 assert(insts[jump_idx] is inst_jump); 467 insts[jump_idx] = len(insts): inst_jump; 468 }; 469 470 if (len(insts) == 0 || !(insts[len(insts) - 1] is inst_match)) { 471 append(insts, false: inst_match)!; 472 }; 473 474 return regex { 475 insts = insts, 476 charsets = charsets, 477 n_reps = n_reps, 478 }; 479 }; 480 481 fn parse_repetition( 482 s: str 483 ) (((void | size), (void | size), size) | error) = { 484 const first_comma = strings::index(s, ","); 485 const first_endbrace = strings::index(s, "}"); 486 if (first_endbrace is void) { 487 return `Repetition expression syntax error '{n}'`: error; 488 }; 489 const first_endbrace = first_endbrace as size; 490 491 let min_str = ""; 492 let max_str = ""; 493 let is_single_arg = false; 494 if (first_comma is void || first_endbrace < first_comma as size) { 495 const cut = strings::cut(s, "}"); 496 min_str = cut.0; 497 max_str = cut.0; 498 is_single_arg = true; 499 } else { 500 const cut = strings::cut(s, ","); 501 min_str = cut.0; 502 max_str = strings::cut(cut.1, "}").0; 503 }; 504 505 let min: (void | size) = void; 506 let max: (void | size) = void; 507 508 if (len(min_str) > 0) { 509 min = match (strconv::stoi(min_str)) { 510 case let res: int => 511 yield if (res < 0) { 512 return `Negative repetition count '{-n}'`: error; 513 } else { 514 yield res: size; 515 }; 516 case => return `Repetition expression syntax error '{n}'`: error; 517 }; 518 } else { 519 min = 0; 520 }; 521 522 if (len(max_str) > 0) { 523 max = match (strconv::stoi(max_str)) { 524 case let res: int => 525 yield if (res < 0) { 526 return `Negative repetition count '{-n}'`: error; 527 } else { 528 yield res: size; 529 }; 530 case => return `Repetition expression syntax error '{n}'`: error; 531 }; 532 }; 533 534 const rep_len = if (is_single_arg) { 535 yield len(min_str); 536 } else { 537 yield len(min_str) + 1 + len(max_str); 538 }; 539 return (min, max, rep_len); 540 }; 541 542 fn delete_thread(i: size, threads: *[]thread) void = { 543 free(threads[i].captures); 544 free(threads[i].rep_counters); 545 delete(threads[i]); 546 }; 547 548 fn is_consuming_inst(a: inst) bool = { 549 return a is (inst_lit | inst_any | inst_charset); 550 }; 551 552 fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = { 553 // Do not add this thread if there is already another thread with 554 // the same PC 555 for (let thread &.. *threads) { 556 if (thread.pc == new_pc && !thread.matched 557 && thread.start_idx 558 < threads[parent_idx].start_idx) { 559 return; 560 }; 561 }; 562 563 append(threads, thread { 564 pc = new_pc, 565 start_idx = threads[parent_idx].start_idx, 566 start_bytesize = threads[parent_idx].start_bytesize, 567 matched = threads[parent_idx].matched, 568 failed = threads[parent_idx].failed, 569 captures = alloc(threads[parent_idx].captures...)!, 570 rep_counters = alloc(threads[parent_idx].rep_counters...)!, 571 ... 572 })!; 573 }; 574 575 fn run_thread( 576 i: size, 577 re: *regex, 578 string: str, 579 threads: *[]thread, 580 r_or_end: (rune | io::EOF), 581 str_idx: size, 582 str_bytesize: size 583 ) (void | newmatch) = { 584 const str_bytes = strings::toutf8(string); 585 if (threads[i].matched) { 586 return; 587 }; 588 for (!is_consuming_inst(re.insts[threads[i].pc])) { 589 match (re.insts[threads[i].pc]) { 590 case inst_lit => abort(); 591 case inst_any => abort(); 592 case inst_split => 593 const new_pc = re.insts[threads[i].pc]: inst_split: size; 594 add_thread(threads, i, new_pc); 595 threads[i].pc += 1; 596 case inst_jump => 597 threads[i].pc = re.insts[threads[i].pc]: inst_jump: size; 598 case inst_skip => 599 const new_pc = threads[i].pc + 1; 600 threads[i].start_idx = str_idx; 601 threads[i].start_bytesize = str_bytesize; 602 add_thread(threads, i, new_pc); 603 break; 604 case let anchored: inst_match => 605 // Do not match if we need an end-anchored match, but we 606 // have not exhausted our string 607 if (anchored && !(r_or_end is io::EOF)) { 608 threads[i].failed = true; 609 return; 610 }; 611 const content = strings::fromutf8_unsafe(str_bytes[ 612 threads[i].start_bytesize..str_bytesize]); 613 threads[i].root_capture = capture { 614 start = threads[i].start_idx, 615 start_bytesize = threads[i].start_bytesize, 616 end = str_idx, 617 end_bytesize = str_bytesize, 618 content = content, 619 }; 620 threads[i].matched = true; 621 return newmatch; 622 case let idx: inst_groupstart => 623 if (idx >= len(threads[i].captures)) { 624 append(threads[i].captures, 625 [capture { ... }...], 626 idx - len(threads[i].captures) + 1)!; 627 }; 628 assert(threads[i].captures[idx].end != types::SIZE_MAX); 629 threads[i].captures[idx] = capture { 630 content = "", 631 start = str_idx, 632 start_bytesize = str_bytesize, 633 // end=types::SIZE_MAX indicates that the 634 // capture group hasn't ended yet 635 end = types::SIZE_MAX, 636 end_bytesize = types::SIZE_MAX, 637 }; 638 threads[i].pc += 1; 639 case inst_groupend => 640 let curr_capture = len(threads[i].captures); 641 for (curr_capture > 0; curr_capture -= 1) { 642 // find inner-most unclosed capture group 643 if (threads[i].captures[curr_capture - 1].end 644 == types::SIZE_MAX) { 645 break; 646 }; 647 }; 648 assert(curr_capture > 0, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`); 649 let capture = &threads[i].captures[curr_capture - 1]; 650 capture.end = str_idx; 651 capture.end_bytesize = str_bytesize; 652 capture.content = strings::fromutf8_unsafe(str_bytes[ 653 capture.start_bytesize..capture.end_bytesize]); 654 threads[i].pc += 1; 655 case let ir: inst_repeat => 656 assert(ir.id < len(threads[i].rep_counters)); 657 threads[i].rep_counters[ir.id] += 1; 658 if (ir.max is size 659 && threads[i].rep_counters[ir.id] 660 > ir.max as size) { 661 threads[i].failed = true; 662 return; 663 }; 664 const new_pc = threads[i].pc + 1; 665 threads[i].pc = ir.origin; 666 if (ir.min is void 667 || threads[i].rep_counters[ir.id] 668 >= ir.min as size) { 669 add_thread(threads, i, new_pc); 670 }; 671 }; 672 }; 673 674 // From now on, we're only matching consuming instructions, and these 675 // can't do anything without another rune. 676 if (r_or_end is io::EOF) { 677 threads[i].failed = true; 678 return; 679 }; 680 681 const r = r_or_end as rune; 682 683 match (re.insts[threads[i].pc]) { 684 case inst_skip => return; 685 case let lit: inst_lit => 686 if (r != lit) { 687 threads[i].failed = true; 688 }; 689 case inst_any => void; 690 case let cs: inst_charset => 691 const charset = re.charsets[cs.idx]; 692 // Disprove the match if we're looking for a negative match 693 // Prove the match if we're looking for a positive match 694 let matched = !cs.is_positive; 695 for (let i = 0z; i < len(charset); i += 1) match (charset[i]) { 696 case let lit: charset_lit_item => 697 if (r == lit) { 698 // Succeeded if positive match 699 // Failed if negative match 700 matched = cs.is_positive; 701 break; 702 }; 703 case let range: charset_range_item => 704 const r_b = r: u32; 705 706 if (r_b >= range.0 && r_b <= range.1) { 707 // Succeeded if positive match 708 // Failed if negative match 709 matched = cs.is_positive; 710 break; 711 }; 712 case let class_item: charset_class_item => 713 const classfn = class_item.1; 714 if (classfn(r)) { 715 // Succeeded if positive match 716 // Failed if negative match 717 matched = cs.is_positive; 718 break; 719 }; 720 }; 721 if (!matched) { 722 threads[i].failed = true; 723 }; 724 case => abort(); // unreachable 725 }; 726 727 threads[i].pc += 1; 728 }; 729 730 // Attempts to match a regular expression against a string and returns the 731 // either the longest leftmost match or all matches. 732 fn search( 733 re: *regex, 734 string: str, 735 handle: io::handle, 736 need_captures: bool 737 ) (void | []capture) = { 738 let threads: []thread = alloc([ 739 thread { captures = [], ... } 740 ])!; 741 if (re.n_reps > 0) { 742 threads[0].rep_counters = alloc([0...], re.n_reps)!; 743 }; 744 defer { 745 for (let i = 0z; i < len(threads); i += 1) { 746 free(threads[i].captures); 747 free(threads[i].rep_counters); 748 }; 749 free(threads); 750 }; 751 752 let str_idx = 0z; 753 let first_match_idx: (void | size) = void; 754 let str_bytesize = 0z; 755 let last_bytesize = 0z; 756 757 const scan = bufio::newscanner(handle); 758 defer bufio::finish(&scan); 759 for (true) { 760 str_bytesize += last_bytesize; 761 762 if (len(threads) == 0) { 763 return void; 764 }; 765 766 let all_matched = true; 767 for (let i = 0z; i < len(threads); i += 1) { 768 if (!threads[i].matched) { 769 all_matched = false; 770 break; 771 }; 772 }; 773 774 if (all_matched) { 775 let best_len = 0z; 776 let best_n_captures = 0z; 777 let best_idx = 0z; 778 for (let i = 0z; i < len(threads); i += 1) { 779 let match_len = threads[i].root_capture.end 780 - threads[i].root_capture.start; 781 const is_better = match_len > best_len 782 || match_len == best_len 783 && len(threads[i].captures) 784 > best_n_captures; 785 if (is_better) { 786 best_len = match_len; 787 best_idx = i; 788 best_n_captures = len(threads[i].captures); 789 }; 790 }; 791 792 // length = number of captures (index of final group + 793 // 1) + root capture 794 let length = 1z; 795 for (let i = len(re.insts); i > 0; i -= 1) { 796 match (re.insts[i - 1]) { 797 case let z: inst_groupstart => 798 length = z + 2; 799 break; 800 case => void; 801 }; 802 }; 803 let res: result = alloc([], length)!; 804 static append(res, threads[best_idx].root_capture)!; 805 static append(res, threads[best_idx].captures...)!; 806 if (length != len(res)) { 807 static append(res, [capture { ... }...], 808 length - len(res))!; 809 }; 810 return res; 811 }; 812 813 const r_or_end = bufio::scan_rune(&scan)!; 814 if (r_or_end is rune) { 815 last_bytesize = utf8::runesz(r_or_end as rune); 816 }; 817 818 for (let i = 0z; i < len(threads); i += 1) { 819 const res = run_thread(i, re, string, &threads, 820 r_or_end, str_idx, str_bytesize); 821 const matchlen = threads[i].root_capture.end 822 - threads[i].root_capture.start; 823 if (res is newmatch && matchlen > 0 && !need_captures) { 824 return []; 825 }; 826 const is_better = res is newmatch && matchlen > 0 827 && (first_match_idx is void 828 || threads[i].start_idx 829 < first_match_idx as size); 830 if (is_better) { 831 first_match_idx = threads[i].start_idx; 832 }; 833 }; 834 str_idx += 1; 835 836 // When we only want the leftmost match, delete all threads that 837 // start after the earliest non-zero-length matched thread 838 if (first_match_idx is size) { 839 for (let thread &.. threads) { 840 if (thread.start_idx > first_match_idx as size) { 841 thread.failed = true; 842 }; 843 }; 844 }; 845 846 // Delete threads that have a PC that has already been 847 // encountered in previous threads. Prioritise threads that 848 // have an earlier start_idx, and threads that were added 849 // earlier. 850 for (let i = 0i64; i < len(threads): i64 - 1; i += 1) { 851 for (let j = i + 1; j < len(threads): i64; j += 1) { 852 const same_pc = threads[i].pc == threads[j].pc; 853 const none_matched = !threads[j].matched 854 && !threads[i].matched; 855 if (same_pc && none_matched) { 856 if (threads[i].start_idx 857 <= threads[j].start_idx) { 858 delete_thread(j: size, &threads); 859 j -= 1; 860 } else { 861 delete_thread(i: size, &threads); 862 i -= 1; 863 break; 864 }; 865 }; 866 }; 867 }; 868 869 for (let i = 0z; i < len(threads); i += 1) { 870 if (threads[i].failed) { 871 delete_thread(i, &threads); 872 i -= 1; 873 }; 874 }; 875 }; 876 }; 877 878 // Returns whether or not a [[regex]] matches any part of a given string. 879 export fn test(re: *regex, string: str) bool = { 880 let strm = memio::fixed(strings::toutf8(string)); 881 return search(re, string, &strm, false) is []capture; 882 }; 883 884 885 // Attempts to match a [[regex]] against a string and returns the longest 886 // leftmost match as a [[result]]. The caller must free the return value with 887 // [[result_free]]. 888 export fn find(re: *regex, string: str) result = { 889 let strm = memio::fixed(strings::toutf8(string)); 890 match (search(re, string, &strm, true)) { 891 case let m: []capture => 892 return m; 893 case void => 894 return []; 895 }; 896 }; 897 898 // Attempts to match a [[regex]] against a string and returns all 899 // non-overlapping matches as a slice of [[result]]s. The caller must free the 900 // return value with [[result_freeall]]. 901 export fn findall(re: *regex, string: str) []result = { 902 let res: []result = []; 903 let str_idx = 0z, str_bytesize = 0z; 904 let strm = memio::fixed(strings::toutf8(string)); 905 const str_bytes = strings::toutf8(string); 906 for (true) { 907 let substring = strings::fromutf8_unsafe( 908 str_bytes[str_bytesize..]); 909 match (search(re, substring, &strm, true)) { 910 case let m: []capture => 911 append(res, m)!; 912 m[0].start += str_idx; 913 m[0].end += str_idx; 914 m[0].start_bytesize += str_bytesize; 915 m[0].end_bytesize += str_bytesize; 916 str_idx = m[0].end; 917 str_bytesize = m[0].end_bytesize; 918 if (m[0].start_bytesize == len(str_bytes)) { 919 // end-of-string reached 920 break; 921 }; 922 if (m[0].start_bytesize == m[0].end_bytesize) { 923 // zero-length match 924 // forward rune and byte indices 925 str_idx += 1; 926 str_bytesize += utf8::utf8sz( 927 str_bytes[str_bytesize])!; 928 }; 929 io::seek(&strm, str_bytesize: io::off, 930 io::whence::SET)!; 931 case void => break; 932 }; 933 }; 934 return res; 935 }; 936 937 // Replaces all non-overlapping matches of a regular expression against a string 938 // with 'targetstr'. 939 // 940 // A backslash followed by a single decimal number within 'targetstr' is 941 // replaced by the capture at that index (starting at 1), or an empty string if 942 // no such capture exists. For example, `\1` is replaced with the first capture, 943 // `\2` with the second, etc. `\0` is substituted with the entire substring that 944 // was matched. `\\` is replaced with a literal backslash. The caller must free 945 // the return value. 946 // 947 // An error is only returned if 'targetstr' isn't formatted correctly. 948 export fn replace(re: *regex, string: str, targetstr: str) (str | error) = { 949 return replacen(re, string, targetstr, types::SIZE_MAX); 950 }; 951 952 // Replaces up to 'n' non-overlapping matches of a regular expression against a 953 // string with 'targetstr', in the same manner as [[replace]]. The caller must 954 // free the return value. 955 export fn replacen( 956 re: *regex, 957 string: str, 958 targetstr: str, 959 n: size, 960 ) (str | error) = { 961 const target = parse_replace_target(targetstr)?; 962 defer free(target); 963 // Check if n == 0 after parse_replace_target so errors are propagated 964 if (n == 0) { 965 return strings::dup(string); 966 }; 967 968 const matches = findall(re, string); 969 if (len(matches) == 0) { 970 return strings::dup(string); 971 }; 972 defer result_freeall(matches); 973 974 const bytes = strings::toutf8(string); 975 let buf = alloc(bytes[..matches[0][0].start_bytesize]...)!; 976 977 const n = if (len(matches) > n) n else len(matches); 978 for (let i = 0z; i < n; i += 1) { 979 for (let j = 0z; j < len(target); j += 1) { 980 match (target[j]) { 981 case let b: []u8 => 982 append(buf, b...)!; 983 case let z: size => 984 if (z >= len(matches[i])) yield; 985 const b = strings::toutf8(matches[i][z].content); 986 append(buf, b...)!; 987 }; 988 }; 989 const start = matches[i][0].end_bytesize; 990 const end = if (i == n - 1) len(bytes) 991 else matches[i + 1][0].start_bytesize; 992 append(buf, bytes[start..end]...)!; 993 }; 994 995 return strings::fromutf8(buf)!; 996 }; 997 998 fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = { 999 const bytes = strings::toutf8(targetstr); 1000 let target: []([]u8 | size) = alloc([], 1)!; 1001 let iter = strings::iter(targetstr); 1002 let start = 0z, end = 0z; 1003 for (true) match (strings::next(&iter)) { 1004 case done => 1005 if (start != end) { 1006 append(target, bytes[start..])!; 1007 }; 1008 break; 1009 case let r: rune => 1010 if (r == '\\') { 1011 if (start != end) { 1012 append(target, bytes[start..end])!; 1013 }; 1014 1015 const r = match (strings::next(&iter)) { 1016 case done => 1017 free(target); 1018 return "Trailing backslash": error; 1019 case let r: rune => 1020 yield r; 1021 }; 1022 1023 if (r == '\\') { 1024 append(target, '\\')!; 1025 } else if (ascii::isdigit(r)) { 1026 append(target, r: u32: size - 0x30)!; 1027 } else { 1028 free(target); 1029 return "Backslash must be followed by positive decimal number or a backslash": error; 1030 }; 1031 1032 end += 2; 1033 start = end; 1034 } else { 1035 end += utf8::runesz(r); 1036 }; 1037 }; 1038 1039 return target; 1040 }; 1041 1042 // Replaces all non-overlapping matches of a regular expression against a string 1043 // with 'targetstr'. 'targetstr' is isn't interpreted in any special way; all 1044 // backslashes are treated literally. The caller must free the return value. 1045 export fn rawreplace(re: *regex, string: str, targetstr: str) str = { 1046 return rawreplacen(re, string, targetstr, types::SIZE_MAX); 1047 }; 1048 1049 // Replaces up to 'n' non-overlapping matches of a regular expression against a 1050 // string with 'targetstr', in the same manner as [[rawreplace]]. The caller 1051 // must free the return value. 1052 export fn rawreplacen(re: *regex, string: str, targetstr: str, n: size) str = { 1053 if (n == 0) { 1054 return strings::dup(string); 1055 }; 1056 1057 const matches = findall(re, string); 1058 if (len(matches) == 0) { 1059 return strings::dup(string); 1060 }; 1061 defer result_freeall(matches); 1062 1063 const target = strings::toutf8(targetstr); 1064 const bytes = strings::toutf8(string); 1065 let buf: []u8 = []; 1066 1067 append(buf, bytes[..matches[0][0].start_bytesize]...)!; 1068 const n = if (len(matches) > n) n else len(matches); 1069 for (let i = 1z; i < n; i += 1) { 1070 append(buf, target...)!; 1071 const start = matches[i - 1][0].end_bytesize; 1072 const end = matches[i][0].start_bytesize; 1073 append(buf, bytes[start..end]...)!; 1074 }; 1075 append(buf, target...)!; 1076 append(buf, bytes[matches[n - 1][0].end_bytesize..]...)!; 1077 1078 return strings::fromutf8(buf)!; 1079 }; 1080 1081 // Frees a [[result]]. 1082 export fn result_free(s: result) void = { 1083 free(s); 1084 }; 1085 1086 // Frees a slice of [[result]]s. 1087 export fn result_freeall(s: []result) void = { 1088 for (let res .. s) { 1089 result_free(res); 1090 }; 1091 free(s); 1092 }; 1093 1094 // Converts an [[error]] into a user-friendly string. 1095 export fn strerror(err: error) str = err;