commit 2e40a9474e3e1c588ea0ba392c1c698dccf295f3
parent f07c16a29c2f6ea89f97ead9191cec0deadf5e98
Author: Max Schillinger <max@mxsr.de>
Date: Wed, 17 Jul 2024 21:11:00 +0200
regex: fix anchor bug in whole-expression alternations
The regex pattern `a|b` doesn't match `xb` because it contains an
implicit start-of-line anchor (`^`) before the `b`. The pattern behaves
like `a|^b`.
This commit fixes this by adding an `inst_skip` at every
whole-expression alternation.
Signed-off-by: Max Schillinger <max@mxsr.de>
Diffstat:
2 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -579,8 +579,12 @@ fn run_rawreplace_case(
(`ab|cd`, "cd", matchres::MATCH, 0, 2),
(`ab|cd`, "abc", matchres::MATCH, 0, 2),
(`ab|cd`, "abcd", matchres::MATCH, 0, 2),
+ (`ab|cd`, "bcd", matchres::MATCH, 1, 3),
+ (`^ab|cd`, "bcd", matchres::MATCH, 1, 3),
+ (`^ab|cd`, "zab", matchres::NOMATCH, 0, 0),
// multiple alternation
(`a|b|c|d|e`, "e", matchres::MATCH, 0, -1),
+ (`a|b|c|d|e`, "xe", matchres::MATCH, 1, -1),
(`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1),
// TODO: nested capture groups
(`((a))`, "abc", matchres::ERROR, 0, -1),
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -311,6 +311,10 @@ export fn compile(expr: str) (regex | error) = {
jump_idxs[len(jump_idxs) - 1] + 1 else origin;
insert(insts[split_idx], newinst);
append(jump_idxs, len(insts) - 1);
+ // add skip if it's a whole-expression alternation
+ if (origin == 0) {
+ append(insts, inst_skip);
+ };
case '{' =>
let origin = len(insts) - 1;
if (insts[origin] is inst_groupend) {