commit 8d2d67b2cbcd5c6322e9fb12c63ee107ed841da6
parent a7f8a6f22db5acfb24887256da0d0a3dce2bf5af
Author: Vlad-Stefan Harbuz <vlad@vlad.website>
Date: Sat, 28 Dec 2024 17:32:38 +0000
regex: fix `jump_idxs` interference
Take this case: a|(b)
We need to remember the position of the `|`, because we will need it
when we reach the end of the string.
However, when we encounter the `)`, all `jump_idxs` are deleted; uh oh!
This commit makes it so that `jump_idxs` are stored per `group_level`,
so as not to interfere with each other.
Signed-off-by: Vlad-Stefan Harbuz <vlad@vlad.website>
Diffstat:
2 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -596,6 +596,8 @@ fn run_rawreplace_case(
(`ab|^cd`, "bcd", matchres::NOMATCH, 0, 0),
(`ab|^cd`, "cde", matchres::MATCH, 0, 2),
(`ab\|^cd`, "cde", matchres::ERROR, 0, 0),
+ (`a|(b)`, "a", matchres::MATCH, 0, -1),
+ (`(a|(b?|c*){,1}|d+|e)`, "e", matchres::MATCH, 0, -1),
// Multiple alternation
(`a|b|c|d|e`, "e", matchres::MATCH, 0, -1),
(`a|b|c|d|e`, "xe", matchres::MATCH, 1, -1),
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -238,7 +238,8 @@ export fn compile(expr: str) (regex | error) = {
let charsets: []charset = [];
let iter = strings::iter(expr);
let r_idx = 0z;
- let jump_idxs: []size = [];
+ let jump_idxs: [][]size = [];
+ append(jump_idxs, []);
defer free(jump_idxs);
let in_bracket = false;
let skip_charclass_rest = false;
@@ -312,17 +313,20 @@ export fn compile(expr: str) (regex | error) = {
append(insts, capture_idx: inst_groupstart);
group_level += 1;
capture_idx += 1;
+ for (len(jump_idxs) < group_level + 1) {
+ append(jump_idxs, []);
+ };
case ')' =>
if (group_level == 0) {
return `Unmatched ')'`: error;
};
- group_level -= 1;
append(insts, inst_groupend);
- for (let jump_idx .. jump_idxs) {
+ for (let jump_idx .. jump_idxs[group_level]) {
assert(insts[jump_idx] is inst_jump);
insts[jump_idx] = (len(insts) - 1): inst_jump;
};
- delete(jump_idxs[..]);
+ delete(jump_idxs[group_level][..]);
+ group_level -= 1;
case '|' =>
append(insts, types::SIZE_MAX: inst_jump);
const origin = match (find_last_groupstart(insts)) {
@@ -333,11 +337,18 @@ export fn compile(expr: str) (regex | error) = {
};
const newinst = (len(insts) + 1): inst_split;
// add split after last jump (if any) or at origin
- const split_idx = if (len(jump_idxs) > 0)
- jump_idxs[len(jump_idxs) - 1] + 1 else origin;
+ const split_idx = if (len(jump_idxs[group_level]) > 0)
+ jump_idxs[group_level][len(jump_idxs[group_level]) - 1] + 1 else origin;
insert(insts[split_idx], newinst);
shift(insts[split_idx + 1..]);
- append(jump_idxs, len(insts) - 1);
+ // we have to now fix up all affected jump_idxs, since
+ // we've shifted them over using the insert
+ for (let jump_idx &.. jump_idxs[group_level]) {
+ if (*jump_idx >= split_idx) {
+ *jump_idx += 1;
+ };
+ };
+ append(jump_idxs[group_level], len(insts) - 1);
// add skip if it's a whole-expression alternation
if (origin == 0) {
const peek1 = strings::next(&iter);
@@ -450,7 +461,7 @@ export fn compile(expr: str) (regex | error) = {
};
// handle whole expression alternation
- for (let jump_idx .. jump_idxs) {
+ for (let jump_idx .. jump_idxs[0]) {
assert(insts[jump_idx] is inst_jump);
insts[jump_idx] = len(insts): inst_jump;
};