regex.ha (25116B)
1 // License: MPL-2.0 2 // (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net> 3 use ascii; 4 use bufio; 5 use encoding::utf8; 6 use errors; 7 use io; 8 use strconv; 9 use strings; 10 11 // An error string describing a compilation error. 12 export type error = !str; 13 14 export type inst_lit = rune, 15 inst_charset = struct { idx: size, is_positive: bool }, 16 inst_any = void, 17 inst_split = size, 18 inst_jump = size, 19 inst_skip = void, 20 inst_match = bool, 21 inst_groupstart = void, 22 inst_groupend = void, 23 inst_repeat = struct { 24 id: size, 25 origin: size, 26 min: (void | size), 27 max: (void | size), 28 }; 29 30 export type inst = (inst_lit | inst_any | inst_split | inst_jump | 31 inst_skip | inst_match | inst_charset | 32 inst_groupstart | inst_groupend | 33 inst_repeat); 34 35 // The resulting match of a [[regex]] applied to a string. 36 // 37 // The first [[capture]] corresponds to the implicit zeroth capture group, 38 // i.e. the whole expression. 39 // 40 // The rest of the [[capture]]s correspond to the rest of the capture groups, 41 // i.e. the sub-expressions. 42 export type result = []capture; 43 44 // A (sub)match corresponding to a regular expression's capture group. 45 export type capture = struct { 46 content: str, 47 start: size, 48 start_bytesize: size, 49 end: size, 50 end_bytesize: size 51 }; 52 53 type thread = struct { 54 pc: size, 55 start_idx: size, 56 start_bytesize: size, 57 root_capture: capture, 58 captures: []capture, 59 curr_capture: capture, 60 curr_capture_inited: bool, 61 rep_counters: []size, 62 matched: bool, 63 failed: bool, 64 }; 65 66 type newmatch = void; 67 68 export type charclass = enum { 69 ALNUM, ALPHA, BLANK, CNTRL, DIGIT, GRAPH, LOWER, PRINT, PUNCT, SPACE, 70 UPPER, XDIGIT, 71 }; 72 export type charset = [](charset_lit_item | charset_range_item | 73 charset_class_item), 74 charset_lit_item = rune, 75 charset_range_item = (u32, u32), 76 charset_class_item = *fn(c: rune) bool; 77 78 const charclass_map: [](str, *fn(c: rune) bool) = [ 79 (":alnum:]", &ascii::isalnum), 80 (":alpha:]", &ascii::isalpha), 81 (":blank:]", &ascii::isblank), 82 (":cntrl:]", &ascii::iscntrl), 83 (":digit:]", &ascii::isdigit), 84 (":graph:]", &ascii::isgraph), 85 (":lower:]", &ascii::islower), 86 (":print:]", &ascii::isprint), 87 (":punct:]", &ascii::ispunct), 88 (":space:]", &ascii::isspace), 89 (":upper:]", &ascii::isupper), 90 (":xdigit:]", &ascii::isxdigit), 91 ]; 92 93 export type regex = struct { 94 insts: []inst, 95 charsets: []charset, 96 n_reps: size, 97 }; 98 99 // Frees resources associated with a [[regex]]. 100 export fn finish(re: *regex) void = { 101 free(re.insts); 102 for (let i = 0z; i < len(re.charsets); i += 1) { 103 free(re.charsets[i]); 104 }; 105 free(re.charsets); 106 }; 107 108 fn find_last_groupstart(insts: *[]inst) (size | error) = { 109 for (let i = len(insts); i > 0; i -= 1) { 110 if (insts[i - 1] is inst_groupstart) { 111 return i - 1; 112 }; 113 }; 114 return `Unmatched ')'`: error; 115 }; 116 117 fn handle_bracket( 118 insts: *[]inst, 119 r: rune, 120 r_idx: *size, 121 bracket_idx: *int, 122 iter: *strings::iterator, 123 charsets: *[]charset, 124 skip_charclass_rest: *bool, 125 is_charset_positive: *bool, 126 in_bracket: *bool 127 ) (void | error) = { 128 const peek1 = strings::next(iter); 129 const peek2 = strings::next(iter); 130 const peek3 = strings::next(iter); 131 if (!(peek1 is void)) { 132 strings::prev(iter); 133 }; 134 if (!(peek2 is void)) { 135 strings::prev(iter); 136 }; 137 if (!(peek3 is void)) { 138 strings::prev(iter); 139 }; 140 141 if (*bracket_idx == -1) { 142 append(charsets, []); 143 }; 144 *bracket_idx += 1; 145 146 if (*skip_charclass_rest) { 147 if (r == ']') { 148 *skip_charclass_rest = false; 149 }; 150 *r_idx += 1; 151 return; 152 }; 153 154 const is_range = peek1 is rune && peek1 as rune == '-' 155 && !(peek2 is void) && !(peek3 is void) 156 && !(peek2 as rune == ']'); 157 const range_end = peek2; 158 const is_first_char = *bracket_idx == 0 || *bracket_idx == 1 159 && !*is_charset_positive; 160 161 if (r == '\\') { 162 if (peek1 is void) { 163 return `Trailing backslash '\'`: error; 164 } else { 165 append(charsets[len(charsets) - 1], 166 peek1: charset_lit_item); 167 strings::next(iter); 168 *r_idx += 1; 169 }; 170 } else if (r == ']' && !is_first_char) { 171 const newinst = inst_charset { 172 idx = len(charsets) - 1, 173 is_positive = *is_charset_positive, 174 }; 175 append(insts, newinst); 176 *in_bracket = false; 177 *bracket_idx = -1; 178 *is_charset_positive = true; 179 } else if (r == '^' && *bracket_idx == 0) { 180 *is_charset_positive = false; 181 } else if (r == '[' && !(peek1 is void) 182 && peek1 as rune == ':') { 183 const rest = strings::iterstr(iter); 184 const n_cc = len(charclass_map); 185 for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) { 186 if (strings::hasprefix(rest, charclass_map[cc_idx].0)) { 187 append(charsets[len(charsets) - 1], 188 charclass_map[cc_idx].1); 189 *skip_charclass_rest = true; 190 break; 191 }; 192 }; 193 if (!*skip_charclass_rest) { 194 return `No character class after '[:'`: error; 195 }; 196 } else if (is_range) { 197 const start_b = r: u32; 198 const end_b = range_end as rune: u32; 199 200 if (end_b < start_b) { 201 return `Decending bracket expression range '[z-a]'`: error; 202 }; 203 204 append(charsets[len(charsets) - 1], 205 (start_b, end_b): charset_range_item); 206 strings::next(iter); 207 strings::next(iter); 208 *r_idx += 2; 209 } else { 210 append(charsets[len(charsets) - 1], 211 r: charset_lit_item); 212 }; 213 214 *r_idx += 1; 215 }; 216 217 // Compiles a regular expression string into a [[regex]]. 218 export fn compile(expr: str) (regex | error) = { 219 let insts: []inst = []; 220 let charsets: []charset = []; 221 let iter = strings::iter(expr); 222 let r_idx = 0z; 223 let anchored = false; 224 let curr_alt_jump_idx = -1; 225 let in_bracket = false; 226 let skip_charclass_rest = false; 227 let bracket_idx = -1; 228 let is_charset_positive = true; 229 let n_reps = 0z; 230 let n_groupstarts = 0; 231 232 for (true) { 233 const next = strings::next(&iter); 234 235 if (r_idx == 0 && next is rune && next: rune != '^') { 236 append(insts, void: inst_skip); 237 }; 238 239 if (in_bracket) { 240 if (next is void) { 241 return `Unmatched '['`: error; 242 }; 243 const r = next: rune; 244 handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter, 245 &charsets, &skip_charclass_rest, 246 &is_charset_positive, 247 &in_bracket)?; 248 continue; 249 }; 250 251 const r = match (next) { 252 case void => 253 if (n_groupstarts > 0) { 254 return `Unmatched '('`: error; 255 }; 256 break; 257 case let r: rune => yield r; 258 }; 259 switch (r) { 260 case '\\' => 261 const peek1 = strings::next(&iter); 262 if (peek1 is void) { 263 return `Trailing backslash '\'`: error; 264 } else { 265 append(insts, (peek1 as rune): inst_lit); 266 r_idx += 1; 267 }; 268 case '^' => 269 if (r_idx != 0) { 270 return `Anchor '^' not at start`: error; 271 }; 272 case '$' => 273 if (r_idx != len(expr) - 1) { 274 return `Anchor '$' not at end`: error; 275 }; 276 anchored = true; 277 case '[' => 278 in_bracket = true; 279 case ']' => 280 append(insts, r: inst_lit); 281 case '(' => 282 if (n_groupstarts > 0) { 283 return `Nested capture groups are unsupported`: error; 284 }; 285 append(insts, void: inst_groupstart); 286 n_groupstarts += 1; 287 case ')' => 288 if (n_groupstarts == 0) { 289 return `Unmatched ')'`: error; 290 }; 291 n_groupstarts -= 1; 292 append(insts, void: inst_groupend); 293 if (curr_alt_jump_idx != -1) { 294 assert(insts[curr_alt_jump_idx] is inst_jump); 295 insts[curr_alt_jump_idx] = 296 (len(insts) - 1): inst_jump; 297 curr_alt_jump_idx = -1; 298 }; 299 case '|' => 300 append(insts, 9999999: inst_jump); 301 const origin = find_last_groupstart(&insts)? + 1; 302 const newinst = (len(insts) + 1): inst_split; 303 insert(insts[origin], newinst); 304 curr_alt_jump_idx = (len(insts) - 1): int; 305 case '{' => 306 let origin = len(insts) - 1; 307 if (insts[origin] is inst_groupend) { 308 origin = find_last_groupstart(&insts)?; 309 }; 310 const rest = strings::iterstr(&iter); 311 const rep_parts = parse_repetition(rest)?; 312 const can_skip = rep_parts.0 == 0; 313 const min = if (rep_parts.0 == 0) { 314 yield 1z; 315 } else { 316 yield rep_parts.0; 317 }; 318 if (can_skip) { 319 insert(insts[origin], 320 len(insts) + 2: inst_split); 321 origin += 1; 322 }; 323 const newinst = inst_repeat { 324 id = n_reps, 325 origin = origin, 326 min = min, 327 max = rep_parts.1, 328 }; 329 for (let i = 0z; i <= rep_parts.2; i += 1) { 330 strings::next(&iter); 331 r_idx += 1; 332 }; 333 append(insts, newinst); 334 n_reps += 1; 335 case '?' => 336 if (r_idx == 0 || len(insts) == 0) { 337 return `Unused '?'`: error; 338 }; 339 let term_start_idx = len(insts) - 1; 340 match (insts[term_start_idx]) { 341 case (inst_lit | inst_charset | inst_any) => void; 342 case inst_groupend => 343 term_start_idx = find_last_groupstart(&insts)?; 344 case inst_groupstart => 345 return `Unused '?'`: error; 346 case => 347 return `Misused '?'`: error; 348 }; 349 const after_idx = len(insts) + 1; 350 insert(insts[term_start_idx], after_idx: inst_split); 351 case '*' => 352 if (r_idx == 0 || len(insts) == 0) { 353 return `Unused '*'`: error; 354 }; 355 const new_inst_offset = 1z; 356 const jump_idx = len(insts) + new_inst_offset; 357 const after_idx = jump_idx + 1z; 358 let term_start_idx = len(insts) - 1z; 359 match (insts[term_start_idx]) { 360 case (inst_lit | inst_charset | inst_any) => void; 361 case inst_groupend => 362 term_start_idx = find_last_groupstart(&insts)?; 363 case inst_groupstart => 364 return `Unused '*'`: error; 365 case => 366 return `Misused '*'`: error; 367 }; 368 const split_idx = term_start_idx; 369 term_start_idx += new_inst_offset; 370 insert(insts[split_idx], after_idx: inst_split); 371 append(insts, split_idx: inst_jump); 372 case '+' => 373 if (r_idx == 0 || len(insts) == 0) { 374 return `Unused '+'`: error; 375 }; 376 let term_start_idx = len(insts) - 1; 377 match (insts[term_start_idx]) { 378 case (inst_lit | inst_charset | inst_any) => void; 379 case inst_groupend => 380 term_start_idx = find_last_groupstart(&insts)?; 381 case inst_groupstart => 382 return `Unused '+'`: error; 383 case => 384 return `Misused '+'`: error; 385 }; 386 append(insts, term_start_idx: inst_split); 387 case '.' => 388 append(insts, void: inst_any); 389 case => 390 append(insts, r: inst_lit); 391 }; 392 r_idx += 1; 393 }; 394 395 append(insts, anchored: inst_match); 396 397 return regex { 398 insts = insts, 399 charsets = charsets, 400 n_reps = n_reps, 401 }; 402 }; 403 404 fn parse_repetition( 405 s: str 406 ) (((void | size), (void | size), size) | error) = { 407 const first_comma = strings::index(s, ","); 408 const first_endbrace = strings::index(s, "}"); 409 if (first_endbrace is void) { 410 return `Repetition expression syntax error '{n}'`: error; 411 }; 412 const first_endbrace = first_endbrace as size; 413 414 let min_str = ""; 415 let max_str = ""; 416 let is_single_arg = false; 417 if (first_comma is void || first_endbrace < first_comma as size) { 418 const cut = strings::cut(s, "}"); 419 min_str = cut.0; 420 max_str = cut.0; 421 is_single_arg = true; 422 } else { 423 const cut = strings::cut(s, ","); 424 min_str = cut.0; 425 max_str = strings::cut(cut.1, "}").0; 426 }; 427 428 let min: (void | size) = void; 429 let max: (void | size) = void; 430 431 if (len(min_str) > 0) { 432 min = match (strconv::stoi(min_str)) { 433 case let res: int => 434 yield if (res < 0) { 435 return `Negative repitition count '{-n}'`: error; 436 } else { 437 yield res: size; 438 }; 439 case => return `Repetition expression syntax error '{n}'`: error; 440 }; 441 } else { 442 min = 0; 443 }; 444 445 if (len(max_str) > 0) { 446 max = match (strconv::stoi(max_str)) { 447 case let res: int => 448 yield if (res < 0) { 449 return `Negative repitition count '{-n}'`: error; 450 } else { 451 yield res: size; 452 }; 453 case => return `Repetition expression syntax error '{n}'`: error; 454 }; 455 }; 456 457 const rep_len = if (is_single_arg) { 458 yield len(min_str); 459 } else { 460 yield len(min_str) + 1 + len(max_str); 461 }; 462 return (min, max, rep_len); 463 }; 464 465 fn delete_thread(i: size, threads: *[]thread) void = { 466 free(threads[i].captures); 467 free(threads[i].rep_counters); 468 delete(threads[i]); 469 }; 470 471 fn is_consuming_inst(a: inst) bool = { 472 return a is (inst_lit | inst_any | inst_charset); 473 }; 474 475 fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = { 476 // Do not add this thread if there is already another thread with 477 // the same PC 478 for (let i = 0z; i < len(threads); i += 1) { 479 if (threads[i].pc == new_pc 480 && !threads[i].matched 481 && threads[i].start_idx 482 < threads[parent_idx].start_idx) { 483 return; 484 }; 485 }; 486 487 append(threads, thread { 488 pc = new_pc, 489 start_idx = threads[parent_idx].start_idx, 490 start_bytesize = threads[parent_idx].start_bytesize, 491 curr_capture = threads[parent_idx].curr_capture, 492 curr_capture_inited = threads[parent_idx].curr_capture_inited, 493 matched = threads[parent_idx].matched, 494 failed = threads[parent_idx].failed, 495 captures = alloc(threads[parent_idx].captures...), 496 rep_counters = alloc(threads[parent_idx].rep_counters...), 497 ... 498 }); 499 }; 500 501 fn run_thread( 502 i: size, 503 re: *regex, 504 string: str, 505 threads: *[]thread, 506 r_or_end: (rune | io::EOF), 507 str_idx: size, 508 str_bytesize: size 509 ) (void | newmatch) = { 510 const str_bytes = strings::toutf8(string); 511 if (threads[i].matched) { 512 return; 513 }; 514 for (!is_consuming_inst(re.insts[threads[i].pc])) { 515 match (re.insts[threads[i].pc]) { 516 case inst_lit => abort(); 517 case inst_any => abort(); 518 case inst_split => 519 const new_pc = re.insts[threads[i].pc]: inst_split: size; 520 add_thread(threads, i, new_pc); 521 threads[i].pc += 1; 522 case inst_jump => 523 threads[i].pc = re.insts[threads[i].pc]: inst_jump: size; 524 case inst_skip => 525 const new_pc = threads[i].pc + 1; 526 threads[i].start_idx = str_idx; 527 threads[i].start_bytesize = str_bytesize; 528 add_thread(threads, i, new_pc); 529 break; 530 case let anchored: inst_match => 531 // Do not match if we need an end-anchored match, but we 532 // have not exhausted our string 533 if (anchored && !(r_or_end is io::EOF)) { 534 threads[i].failed = true; 535 return; 536 }; 537 const content = strings::fromutf8_unsafe(str_bytes[ 538 threads[i].start_bytesize..str_bytesize]); 539 threads[i].root_capture = capture { 540 start = threads[i].start_idx, 541 start_bytesize = threads[i].start_bytesize, 542 end = str_idx, 543 end_bytesize = str_bytesize, 544 content = content, 545 }; 546 threads[i].matched = true; 547 return newmatch; 548 case inst_groupstart => 549 assert(!threads[i].curr_capture_inited, "Found nested capture groups in expression, which are not supported"); 550 threads[i].curr_capture.start = str_idx; 551 threads[i].curr_capture.start_bytesize = str_bytesize; 552 threads[i].curr_capture_inited = true; 553 threads[i].pc += 1; 554 case inst_groupend => 555 assert(threads[i].curr_capture_inited, `Found a groupend token ")" without having previously seen a groupstart token "(". Please report this as a bug`); 556 threads[i].curr_capture.end = str_idx; 557 threads[i].curr_capture.end_bytesize = str_bytesize; 558 threads[i].curr_capture.content = 559 strings::fromutf8_unsafe(str_bytes[ 560 threads[i].curr_capture.start_bytesize.. 561 threads[i].curr_capture.end_bytesize]); 562 append(threads[i].captures, threads[i].curr_capture); 563 threads[i].curr_capture = capture { ... }; 564 threads[i].curr_capture_inited = false; 565 threads[i].pc += 1; 566 case let ir: inst_repeat => 567 assert(ir.id < len(threads[i].rep_counters)); 568 threads[i].rep_counters[ir.id] += 1; 569 if (ir.max is size 570 && threads[i].rep_counters[ir.id] 571 > ir.max as size) { 572 threads[i].failed = true; 573 return; 574 }; 575 const new_pc = threads[i].pc + 1; 576 threads[i].pc = ir.origin; 577 if (ir.min is void 578 || threads[i].rep_counters[ir.id] 579 >= ir.min as size) { 580 add_thread(threads, i, new_pc); 581 }; 582 }; 583 }; 584 585 // From now on, we're only matching consuming instructions, and these 586 // can't do anything without another rune. 587 if (r_or_end is io::EOF) { 588 threads[i].failed = true; 589 return; 590 }; 591 592 const r = r_or_end as rune; 593 594 match (re.insts[threads[i].pc]) { 595 case inst_skip => return; 596 case let lit: inst_lit => 597 if (r != lit) { 598 threads[i].failed = true; 599 }; 600 case inst_any => void; 601 case let cs: inst_charset => 602 const charset = re.charsets[cs.idx]; 603 // Disprove the match if we're looking for a negative match 604 // Prove the match if we're looking for a positive match 605 let matched = !cs.is_positive; 606 for (let i = 0z; i < len(charset); i += 1) match (charset[i]) { 607 case let lit: charset_lit_item => 608 if (r == lit) { 609 // Succeeded if positive match 610 // Failed if negative match 611 matched = cs.is_positive; 612 break; 613 }; 614 case let range: charset_range_item => 615 const r_b = r: u32; 616 617 if (r_b >= range.0 && r_b <= range.1) { 618 // Succeeded if positive match 619 // Failed if negative match 620 matched = cs.is_positive; 621 break; 622 }; 623 case let classfn: charset_class_item => 624 if (classfn(r)) { 625 // Succeeded if positive match 626 // Failed if negative match 627 matched = cs.is_positive; 628 break; 629 }; 630 }; 631 if (!matched) { 632 threads[i].failed = true; 633 }; 634 }; 635 636 threads[i].pc += 1; 637 }; 638 639 // Attempts to match a regular expression against a string and returns the 640 // either the longest leftmost match or all matches. 641 fn search( 642 re: *regex, 643 string: str, 644 handle: io::handle, 645 need_captures: bool 646 ) (void | []capture) = { 647 let threads: []thread = alloc([ 648 thread { captures = [], ... } 649 ]); 650 if (re.n_reps > 0) { 651 threads[0].rep_counters = alloc([0...], re.n_reps); 652 }; 653 defer { 654 for (let i = 0z; i < len(threads); i += 1) { 655 free(threads[i].captures); 656 free(threads[i].rep_counters); 657 }; 658 free(threads); 659 }; 660 661 let str_idx = 0z; 662 let first_match_idx: (void | size) = void; 663 let str_bytesize = 0z; 664 let last_bytesize = 0z; 665 666 for (true) { 667 str_bytesize += last_bytesize; 668 669 if (len(threads) == 0) { 670 return void; 671 }; 672 673 let all_matched = true; 674 for (let i = 0z; i < len(threads); i += 1) { 675 if (!threads[i].matched) { 676 all_matched = false; 677 break; 678 }; 679 }; 680 681 if (all_matched) { 682 let best_len = 0z; 683 let best_n_captures = 0z; 684 let best_idx = 0z; 685 for (let i = 0z; i < len(threads); i += 1) { 686 let match_len = threads[i].root_capture.end 687 - threads[i].root_capture.start; 688 const is_better = match_len > best_len 689 || match_len == best_len 690 && len(threads[i].captures) 691 > best_n_captures; 692 if (is_better) { 693 best_len = match_len; 694 best_idx = i; 695 best_n_captures = len(threads[i].captures); 696 }; 697 }; 698 let res: []capture = alloc([], 699 len(threads[best_idx].captures) + 1); 700 append(res, threads[best_idx].root_capture); 701 append(res, threads[best_idx].captures...); 702 return res; 703 }; 704 705 const r_or_end = bufio::scanrune(handle)!; 706 if (r_or_end is rune) { 707 last_bytesize = utf8::runesz(r_or_end as rune); 708 }; 709 710 for (let i = 0z; i < len(threads); i += 1) { 711 const res = run_thread(i, re, string, &threads, 712 r_or_end, str_idx, str_bytesize); 713 const matchlen = threads[i].root_capture.end 714 - threads[i].root_capture.start; 715 if (res is newmatch && matchlen > 0 && !need_captures) { 716 return []: []capture; 717 }; 718 const is_better = res is newmatch && matchlen > 0 719 && (first_match_idx is void 720 || threads[i].start_idx 721 < first_match_idx as size); 722 if (is_better) { 723 first_match_idx = threads[i].start_idx; 724 }; 725 }; 726 str_idx += 1; 727 728 // When we only want the leftmost match, delete all threads that 729 // start after the earliest non-zero-length matched thread 730 if (first_match_idx is size) { 731 for (let i = 0z; i < len(threads); i += 1) { 732 if (threads[i].start_idx 733 > first_match_idx as size) { 734 threads[i].failed = true; 735 }; 736 }; 737 }; 738 739 // Delete threads that have a PC that has already been 740 // encountered in previous threads. Prioritise threads that 741 // have an earlier start_idx, and threads that were added 742 // earlier. 743 for (let i = 0i64; i < len(threads): i64 - 1; i += 1) { 744 for (let j = i + 1; j < len(threads): i64; j += 1) { 745 const same_pc = threads[i].pc == threads[j].pc; 746 const none_matched = !threads[j].matched 747 && !threads[i].matched; 748 if (same_pc && none_matched) { 749 if (threads[i].start_idx 750 <= threads[j].start_idx) { 751 delete_thread(j: size, &threads); 752 j -= 1; 753 } else { 754 delete_thread(i: size, &threads); 755 i -= 1; 756 break; 757 }; 758 }; 759 }; 760 }; 761 762 for (let i = 0z; i < len(threads); i += 1) { 763 if (threads[i].failed) { 764 delete_thread(i, &threads); 765 i -= 1; 766 }; 767 }; 768 }; 769 770 return void; 771 }; 772 773 // Returns whether or not a [[regex]] matches any part of a given string. 774 export fn test(re: *regex, string: str) bool = { 775 let strm = bufio::fixed(strings::toutf8(string), io::mode::READ); 776 return search(re, string, &strm, false) is []capture; 777 }; 778 779 780 // Attempts to match a [[regex]] against a string and returns the longest 781 // leftmost match as a [[result]]. The caller must free the return value with 782 // [[result_free]]. 783 export fn find(re: *regex, string: str) result = { 784 let strm = bufio::fixed(strings::toutf8(string), io::mode::READ); 785 match (search(re, string, &strm, true)) { 786 case let m: []capture => 787 return m; 788 case void => 789 return []; 790 }; 791 }; 792 793 // Attempts to match a [[regex]] against a string and returns all 794 // non-overlapping matches as a slice of [[result]]s. The caller must free the 795 // return value with [[result_freeall]]. 796 export fn findall(re: *regex, string: str) []result = { 797 let res: [][]capture = []; 798 let str_idx = 0z, str_bytesize = 0z; 799 let substring = string; 800 let strm = bufio::fixed(strings::toutf8(string), io::mode::READ); 801 const str_bytes = strings::toutf8(string); 802 for (true) { 803 match (search(re, substring, &strm, true)) { 804 case let m: []capture => 805 append(res, m); 806 m[0].start += str_idx; 807 m[0].end += str_idx; 808 m[0].start_bytesize += str_bytesize; 809 m[0].end_bytesize += str_bytesize; 810 str_idx = m[0].end; 811 str_bytesize = m[0].end_bytesize; 812 substring = strings::fromutf8(str_bytes[str_bytesize..])!; 813 io::seek(&strm, str_bytesize: io::off, 814 io::whence::SET)!; 815 case void => break; 816 }; 817 }; 818 return res; 819 }; 820 821 // Replaces all non-overlapping matches of a regular expression against a string 822 // with 'targetstr'. 823 // 824 // A backslash followed by a single decimal number within 'targetstr' is 825 // replaced by the capture at that index (starting at 1), or an empty string if 826 // no such capture exists. For example, `\1` is replaced with the first capture, 827 // `\2` with the second, etc. `\0` is substituted with the entire substring that 828 // was matched. `\\` is replaced with a literal backslash. 829 // 830 // An error is only returned if 'targetstr' isn't formatted correctly. 831 export fn replace(re: *regex, string: str, targetstr: str) (str | error) = { 832 const matches = findall(re, string); 833 if (len(matches) == 0) { 834 return strings::dup(string); 835 }; 836 defer result_freeall(matches); 837 838 const target = parse_replace_target(targetstr)?; 839 defer free(target); 840 841 const bytes = strings::toutf8(string); 842 let buf = alloc(bytes[..matches[0][0].start_bytesize]...); 843 844 for (let i = 0z; i < len(matches); i += 1) { 845 for (let j = 0z; j < len(target); j += 1) { 846 match (target[j]) { 847 case let b: []u8 => 848 append(buf, b...); 849 case let z: size => 850 if (z >= len(matches[i])) yield; 851 const b = strings::toutf8(matches[i][z].content); 852 append(buf, b...); 853 }; 854 }; 855 const start = matches[i][0].end_bytesize; 856 const end = if (i == len(matches) - 1) len(bytes) 857 else matches[i + 1][0].start_bytesize; 858 append(buf, bytes[start..end]...); 859 }; 860 861 return strings::fromutf8(buf)!; 862 }; 863 864 fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = { 865 const bytes = strings::toutf8(targetstr); 866 let target: []([]u8 | size) = alloc([], 1); 867 let iter = strings::iter(targetstr); 868 let start = 0z, end = 0z; 869 for (true) match (strings::next(&iter)) { 870 case void => 871 if (start != end) { 872 append(target, bytes[start..]); 873 }; 874 break; 875 case let r: rune => 876 if (r == '\\') { 877 if (start != end) { 878 append(target, bytes[start..end]); 879 }; 880 881 const r = match (strings::next(&iter)) { 882 case void => 883 return "Trailing backslash": error; 884 case let r: rune => 885 yield r; 886 }; 887 888 if (r == '\\') { 889 append(target, '\\'); 890 } else if (ascii::isdigit(r)) { 891 append(target, r: u32: size - 0x30); 892 } else { 893 return "Backslash must be followed by positive decimal number or a backslash": error; 894 }; 895 896 end += 2; 897 start = end; 898 } else { 899 end += utf8::runesz(r); 900 }; 901 }; 902 903 return target; 904 }; 905 906 // Replaces all non-overlapping matches of a regular expression against a string 907 // with 'targetstr'. 'targetstr' is isn't interpreted in any special way; all 908 // backslashes are treated literally. 909 export fn rawreplace(re: *regex, string: str, targetstr: str) str = { 910 const matches = findall(re, string); 911 if (len(matches) == 0) { 912 return strings::dup(string); 913 }; 914 defer result_freeall(matches); 915 916 const target = strings::toutf8(targetstr); 917 const bytes = strings::toutf8(string); 918 let buf: []u8 = []; 919 920 append(buf, bytes[..matches[0][0].start_bytesize]...); 921 for (let i = 1z; i < len(matches); i += 1) { 922 append(buf, target...); 923 const start = matches[i - 1][0].end_bytesize; 924 const end = matches[i][0].start_bytesize; 925 append(buf, bytes[start..end]...); 926 }; 927 append(buf, target...); 928 append(buf, bytes[matches[len(matches) - 1][0].end_bytesize..]...); 929 930 return strings::fromutf8(buf)!; 931 }; 932 933 // Frees a [[result]]. 934 export fn result_free(s: result) void = { 935 free(s); 936 }; 937 938 // Frees a slice of [[result]]s. 939 export fn result_freeall(s: []result) void = { 940 for (let i = 0z; i < len(s); i += 1) { 941 result_free(s[i]); 942 }; 943 free(s); 944 }; 945 946 // Converts an [[error]] into a user-friendly string. 947 export fn strerror(err: error) str = err;