commit 413a5e43a6bfa6f7d7e0fca27a367e92c0ede2da
parent d1cf40891bc75c036e9ce60ab5bc9cfad32baf74
Author: Max Schillinger <max@mxsr.de>
Date: Sun, 14 Apr 2024 22:44:24 +0200
regex: implement whole-expression alternation
When a pattern contains a `|` outside of a capture group, treat it as a
whole-expression alternation: the split is inserted at the start of the
program and the pending jump is resolved to the final match instruction.
Signed-off-by: Max Schillinger <max@mxsr.de>
Implements: https://todo.sr.ht/~sircmpwn/hare/695
Diffstat:
2 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -577,9 +577,10 @@ fn run_rawreplace_case(
(`(a|ab)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
(`(ab|a)(c|bcd)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
(`(ab|a)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
- // TODO: whole-expression alternation
- // (`ab|cd`, "abc", matchres::MATCH, 0, -1),
- // (`ab|cd`, "abcd", matchres::MATCH, 0, -1),
+ // whole-expression alternation
+ (`ab|cd`, "cd", matchres::MATCH, 0, 2),
+ (`ab|cd`, "abc", matchres::MATCH, 0, 2),
+ (`ab|cd`, "abcd", matchres::MATCH, 0, 2),
// TODO: multiple alternation
// (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1),
// (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1),
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -300,7 +300,12 @@ export fn compile(expr: str) (regex | error) = {
};
case '|' =>
append(insts, types::SIZE_MAX: inst_jump);
- const origin = find_last_groupstart(&insts)? + 1;
+ const origin = match (find_last_groupstart(&insts)) {
+ case error =>
+ yield 0z;
+ case let sz: size =>
+ yield sz + 1;
+ };
const newinst = (len(insts) + 1): inst_split;
insert(insts[origin], newinst);
curr_alt_jump_idx = (len(insts) - 1): int;
@@ -394,6 +399,13 @@ export fn compile(expr: str) (regex | error) = {
r_idx += 1;
};
+ // handle whole-expression alternation
+ if (curr_alt_jump_idx != -1) {
+ assert(insts[curr_alt_jump_idx] is inst_jump);
+ insts[curr_alt_jump_idx] = len(insts): inst_jump;
+ curr_alt_jump_idx = -1;
+ };
+
append(insts, anchored: inst_match);
return regex {