hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 8d2d67b2cbcd5c6322e9fb12c63ee107ed841da6
parent a7f8a6f22db5acfb24887256da0d0a3dce2bf5af
Author: Vlad-Stefan Harbuz <vlad@vlad.website>
Date:   Sat, 28 Dec 2024 17:32:38 +0000

regex: fix `jump_idxs` interference

Take this case: a|(b)

We need to remember the position of the `|`, because we will need it
when we reach the end of the string.

However, when we encounter the `)`, all `jump_idxs` are deleted; uh oh!

This commit makes it so that `jump_idxs` are stored per `group_level`,
so as not to interfere with each other.

Signed-off-by: Vlad-Stefan Harbuz <vlad@vlad.website>

Diffstat:
Mregex/+test.ha | 2++
Mregex/regex.ha | 27+++++++++++++++++++--------
2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha @@ -596,6 +596,8 @@ fn run_rawreplace_case( (`ab|^cd`, "bcd", matchres::NOMATCH, 0, 0), (`ab|^cd`, "cde", matchres::MATCH, 0, 2), (`ab\|^cd`, "cde", matchres::ERROR, 0, 0), + (`a|(b)`, "a", matchres::MATCH, 0, -1), + (`(a|(b?|c*){,1}|d+|e)`, "e", matchres::MATCH, 0, -1), // Multiple alternation (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1), (`a|b|c|d|e`, "xe", matchres::MATCH, 1, -1), diff --git a/regex/regex.ha b/regex/regex.ha @@ -238,7 +238,8 @@ export fn compile(expr: str) (regex | error) = { let charsets: []charset = []; let iter = strings::iter(expr); let r_idx = 0z; - let jump_idxs: []size = []; + let jump_idxs: [][]size = []; + append(jump_idxs, []); defer free(jump_idxs); let in_bracket = false; let skip_charclass_rest = false; @@ -312,17 +313,20 @@ export fn compile(expr: str) (regex | error) = { append(insts, capture_idx: inst_groupstart); group_level += 1; capture_idx += 1; + for (len(jump_idxs) < group_level + 1) { + append(jump_idxs, []); + }; case ')' => if (group_level == 0) { return `Unmatched ')'`: error; }; - group_level -= 1; append(insts, inst_groupend); - for (let jump_idx .. jump_idxs) { + for (let jump_idx .. jump_idxs[group_level]) { assert(insts[jump_idx] is inst_jump); insts[jump_idx] = (len(insts) - 1): inst_jump; }; - delete(jump_idxs[..]); + delete(jump_idxs[group_level][..]); + group_level -= 1; case '|' => append(insts, types::SIZE_MAX: inst_jump); const origin = match (find_last_groupstart(insts)) { @@ -333,11 +337,18 @@ export fn compile(expr: str) (regex | error) = { }; const newinst = (len(insts) + 1): inst_split; // add split after last jump (if any) or at origin - const split_idx = if (len(jump_idxs) > 0) - jump_idxs[len(jump_idxs) - 1] + 1 else origin; + const split_idx = if (len(jump_idxs[group_level]) > 0) + jump_idxs[group_level][len(jump_idxs[group_level]) - 1] + 1 else origin; insert(insts[split_idx], newinst); shift(insts[split_idx + 1..]); - append(jump_idxs, len(insts) - 1); + // we have to now fix up all affected jump_idxs, since + // we've shifted them over using the insert + for (let jump_idx &.. jump_idxs[group_level]) { + if (*jump_idx >= split_idx) { + *jump_idx += 1; + }; + }; + append(jump_idxs[group_level], len(insts) - 1); // add skip if it's a whole-expression alternation if (origin == 0) { const peek1 = strings::next(&iter); @@ -450,7 +461,7 @@ export fn compile(expr: str) (regex | error) = { }; // handle whole expression alternation - for (let jump_idx .. jump_idxs) { + for (let jump_idx .. jump_idxs[0]) { assert(insts[jump_idx] is inst_jump); insts[jump_idx] = len(insts): inst_jump; };