commit bf1e316aced9e60aeb4d85bc38fa3158544eaf6c
parent 51d4feb9fa969343789246b5566f1b85ddbb6013
Author: Max Schillinger <max@mxsr.de>
Date: Mon, 15 Apr 2024 17:47:28 +0200
regex: implement multiple alternation
Example: The regex pattern `(ha|py|sh)` is implemented using the
following Hare Regular Expression Engine Virtual Machine NFA
Representation:
0 groupstart
1 split → 5 (split)
2 lit h
3 lit a
4 jump → 11 (groupend)
5 split → 9 (s)
6 lit p
7 lit y
8 jump → 11 (groupend)
9 lit s
10 lit h
11 groupend
Implements: https://todo.sr.ht/~sircmpwn/hare/696
Signed-off-by: Max Schillinger <max@mxsr.de>
Diffstat:
2 files changed, 17 insertions(+), 15 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -581,9 +581,9 @@ fn run_rawreplace_case(
(`ab|cd`, "cd", matchres::MATCH, 0, 2),
(`ab|cd`, "abc", matchres::MATCH, 0, 2),
(`ab|cd`, "abcd", matchres::MATCH, 0, 2),
- // TODO: multiple alternation
- // (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1),
- // (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1),
+ // multiple alternation
+ (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1),
+ (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1),
// TODO: nested capture groups
(`((a))`, "abc", matchres::ERROR, 0, -1),
// (`((a))`, "abc", matchres::MATCH, 0, -1),
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -223,7 +223,7 @@ export fn compile(expr: str) (regex | error) = {
let iter = strings::iter(expr);
let r_idx = 0z;
let anchored = false;
- let curr_alt_jump_idx = -1;
+ let jump_idxs: []size = [];
let in_bracket = false;
let skip_charclass_rest = false;
let bracket_idx = -1;
@@ -292,12 +292,11 @@ export fn compile(expr: str) (regex | error) = {
};
n_groupstarts -= 1;
append(insts, void: inst_groupend);
- if (curr_alt_jump_idx != -1) {
- assert(insts[curr_alt_jump_idx] is inst_jump);
- insts[curr_alt_jump_idx] =
- (len(insts) - 1): inst_jump;
- curr_alt_jump_idx = -1;
+ for (let jump_idx .. jump_idxs) {
+ assert(insts[jump_idx] is inst_jump);
+ insts[jump_idx] = (len(insts) - 1): inst_jump;
};
+ jump_idxs = [];
case '|' =>
append(insts, types::SIZE_MAX: inst_jump);
const origin = match (find_last_groupstart(&insts)) {
@@ -307,8 +306,11 @@ export fn compile(expr: str) (regex | error) = {
yield sz + 1;
};
const newinst = (len(insts) + 1): inst_split;
- insert(insts[origin], newinst);
- curr_alt_jump_idx = (len(insts) - 1): int;
+ // add split after last jump (if any) or at origin
+ const split_idx = if (len(jump_idxs) > 0)
+ jump_idxs[len(jump_idxs) - 1] + 1 else origin;
+ insert(insts[split_idx], newinst);
+ append(jump_idxs, len(insts) - 1);
case '{' =>
let origin = len(insts) - 1;
if (insts[origin] is inst_groupend) {
@@ -400,11 +402,11 @@ export fn compile(expr: str) (regex | error) = {
};
// handle whole expression alternation
- if (curr_alt_jump_idx != -1) {
- assert(insts[curr_alt_jump_idx] is inst_jump);
- insts[curr_alt_jump_idx] = len(insts): inst_jump;
- curr_alt_jump_idx = -1;
+ for (let jump_idx .. jump_idxs) {
+ assert(insts[jump_idx] is inst_jump);
+ insts[jump_idx] = len(insts): inst_jump;
};
+ jump_idxs = [];
append(insts, anchored: inst_match);