hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 2e40a9474e3e1c588ea0ba392c1c698dccf295f3
parent f07c16a29c2f6ea89f97ead9191cec0deadf5e98
Author: Max Schillinger <max@mxsr.de>
Date:   Wed, 17 Jul 2024 21:11:00 +0200

regex: fix anchor bug in whole-expression alternations

The regex pattern `a|b` doesn't match `xb` because it contains an
implicit start-of-line anchor (`^`) before the `b`. The pattern behaves
like `a|^b`.

This commit fixes this by adding an `inst_skip` at every
whole-expression alternation.

Signed-off-by: Max Schillinger <max@mxsr.de>

Diffstat:
M regex/+test.ha | 4 ++++
M regex/regex.ha | 4 ++++
2 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha @@ -579,8 +579,12 @@ fn run_rawreplace_case( (`ab|cd`, "cd", matchres::MATCH, 0, 2), (`ab|cd`, "abc", matchres::MATCH, 0, 2), (`ab|cd`, "abcd", matchres::MATCH, 0, 2), + (`ab|cd`, "bcd", matchres::MATCH, 1, 3), + (`^ab|cd`, "bcd", matchres::MATCH, 1, 3), + (`^ab|cd`, "zab", matchres::NOMATCH, 0, 0), // multiple alternation (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1), + (`a|b|c|d|e`, "xe", matchres::MATCH, 1, -1), (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1), // TODO: nested capture groups (`((a))`, "abc", matchres::ERROR, 0, -1), diff --git a/regex/regex.ha b/regex/regex.ha @@ -311,6 +311,10 @@ export fn compile(expr: str) (regex | error) = { jump_idxs[len(jump_idxs) - 1] + 1 else origin; insert(insts[split_idx], newinst); append(jump_idxs, len(insts) - 1); + // add skip if it's a whole-expression alternation + if (origin == 0) { + append(insts, inst_skip); + }; case '{' => let origin = len(insts) - 1; if (insts[origin] is inst_groupend) {