commit e62e47b5de35c8dde6dd2e917f54d3118d68c6dc
parent 798e7b63d399ae347bf85bd512581cfc563bfc5e
Author: Sebastian <sebastian@sebsite.pw>
Date: Sun, 23 Apr 2023 02:41:35 -0400
regex: add replace and rawreplace
Implements: https://todo.sr.ht/~sircmpwn/hare/710
Signed-off-by: Sebastian <sebastian@sebsite.pw>
Diffstat:
M | regex/+test.ha | | | 110 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
M | regex/regex.ha | | | 112 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 222 insertions(+), 0 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -133,6 +133,71 @@ fn run_findall_case(
};
};
+// Test helper: compiles 'expr', runs [[replace]] over 'string' with 'target',
+// and aborts with a diagnostic unless the outcome agrees with 'expected'. A
+// str means replace must succeed with exactly that text; void means replace
+// must return an error.
+fn run_replace_case(
+	expr: str,
+	string: str,
+	target: str,
+	expected: (str | void),
+) void = {
+	const re = match (compile(expr)) {
+	case let compiled: regex =>
+		yield compiled;
+	case let err: error =>
+		fmt::errorln(err)!;
+		fmt::errorfln("Expected expression /{}/ to compile, but it errored",
+			expr)!;
+		abort();
+	};
+	defer finish(&re);
+
+	match (replace(&re, string, target)) {
+	case let err: error =>
+		// A failure is only a problem when a result was expected.
+		match (expected) {
+		case void =>
+			yield;
+		case let want: str =>
+			fmt::errorln(err)!;
+			fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\"",
+				expr, string, target, want)!;
+			abort();
+		};
+	case let got: str =>
+		defer free(got);
+		const want = match (expected) {
+		case let want: str =>
+			yield want;
+		case void =>
+			fmt::errorln("Expected replace to fail, but it did not")!;
+			fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" return=\"{}\"",
+				expr, string, target, got)!;
+			abort();
+		};
+		if (want != got) {
+			fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\" return=\"{}\"",
+				expr, string, target, want, got)!;
+			abort();
+		};
+	};
+};
+
+// Test helper: compiles 'expr', runs [[rawreplace]] over 'string' with
+// 'target', and aborts with a diagnostic unless the result equals 'expected'.
+fn run_rawreplace_case(
+	expr: str,
+	string: str,
+	target: str,
+	expected: str,
+) void = {
+	const re = match (compile(expr)) {
+	case let compiled: regex =>
+		yield compiled;
+	case let err: error =>
+		fmt::errorln(err)!;
+		fmt::errorfln("Expected expression /{}/ to compile, but it errored",
+			expr)!;
+		abort();
+	};
+	defer finish(&re);
+
+	const got = rawreplace(&re, string, target);
+	defer free(got);
+	if (got == expected) {
+		return;
+	};
+
+	fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\" return=\"{}\"",
+		expr, string, target, expected, got)!;
+	abort();
+};
+
@test fn find() void = {
const cases = [
// literals
@@ -605,3 +670,48 @@ fn run_findall_case(
run_findall_case(expr, string, should_match, targets);
};
};
+
+// Table-driven test for [[replace]]. Each row is (expression, input string,
+// replacement target, expected result); a void expectation means the target is
+// malformed and replace must return an error.
+@test fn replace() void = {
+	const cases: [_](str, str, str, (str | void)) = [
+		(`ab.`, "hello abc and abあ test abq thanks", `xyz`,
+			"hello xyz and xyz test xyz thanks"),
+		(`([Hh])ello`, "Hello world and hello Hare.", `\1owdy`,
+			"Howdy world and howdy Hare."),
+		(`fo{2,}`, "fo foo fooofoof oofoo", `\0bar`,
+			"fo foobar fooobarfoobarf oofoobar"),
+		// Only a single digit follows the backslash: `\10` is capture 1
+		// followed by a literal "0".
+		(`(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)`, "12345678910", `\10`,
+			"10"),
+		// References to captures which don't exist expand to nothing.
+		(`...?`, "abcdefgh", `\7\0\8`,
+			"abcdefgh"),
+		// Trailing backslash.
+		(`...?`, "abcdefgh", `\7\0\`, void),
+	];
+
+	for (let i = 0z; i < len(cases); i += 1) {
+		const c = cases[i];
+		run_replace_case(c.0, c.1, c.2, c.3);
+	};
+};
+
+// Table-driven test for [[rawreplace]]. Each row is (expression, input string,
+// replacement target, expected result). The targets deliberately contain
+// backslashes, which must be copied through untouched.
+@test fn rawreplace() void = {
+	const cases = [
+		(`ab.`, "hello abc and abあ test abq thanks", "xyz",
+			"hello xyz and xyz test xyz thanks"),
+		(`([Hh])ello`, "Hello world and hello Hare.", `\howdy\`,
+			`\howdy\ world and \howdy\ Hare.`),
+		(`fo{2,}`, "fo foo fooofoof oofoo", `\0bar`,
+			`fo \0bar \0bar\0barf oo\0bar`),
+		(`\\\\`, `\\\\\\\\`, `\00\1`,
+			`\00\1\00\1\00\1\00\1`),
+	];
+
+	for (let i = 0z; i < len(cases); i += 1) {
+		const c = cases[i];
+		run_rawreplace_case(c.0, c.1, c.2, c.3);
+	};
+};
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -823,6 +823,118 @@ export fn findall(re: *regex, string: str) []result = {
return res;
};
+// Replaces all non-overlapping matches of a regular expression against a string
+// with 'targetstr'.
+//
+// A backslash followed by a single decimal digit within 'targetstr' is
+// replaced by the capture at that index (starting at 1), or an empty string if
+// no such capture exists. For example, `\1` is replaced with the first capture,
+// `\2` with the second, etc. `\0` is substituted with the entire substring that
+// was matched. `\\` is replaced with a literal backslash.
+//
+// An error is only returned if 'targetstr' isn't formatted correctly.
+// 'targetstr' is validated before any matching takes place, so a malformed
+// target is reported even when 'string' contains no match. The caller must free
+// the return value.
+export fn replace(re: *regex, string: str, targetstr: str) (str | error) = {
+	// Parse the target first: whether it is well-formed must not depend on
+	// whether the expression happens to match this particular input.
+	const target = parse_replace_target(targetstr)?;
+	defer free(target);
+
+	const matches = findall(re, string);
+	if (len(matches) == 0) {
+		return strings::dup(string);
+	};
+	defer result_freeall(matches);
+
+	const bytes = strings::toutf8(string);
+	// Start with everything which precedes the first match.
+	let buf = alloc(bytes[..matches[0][0].start_bytesize]...);
+
+	for (let i = 0z; i < len(matches); i += 1) {
+		// Expand the target for this match.
+		for (let j = 0z; j < len(target); j += 1) {
+			match (target[j]) {
+			case let b: []u8 =>
+				append(buf, b...);
+			case let z: size =>
+				// References to missing captures expand to nothing.
+				if (z >= len(matches[i])) yield;
+				const b = strings::toutf8(matches[i][z].content);
+				append(buf, b...);
+			};
+		};
+		// Copy the unmatched text up to the next match, or up to the
+		// end of the input after the last match.
+		const start = matches[i][0].end_bytesize;
+		const end = if (i == len(matches) - 1) len(bytes)
+			else matches[i + 1][0].start_bytesize;
+		append(buf, bytes[start..end]...);
+	};
+
+	return strings::fromutf8(buf)!;
+};
+
+// Splits a [[replace]] target string into the pieces which are emitted for each
+// match, in order. A []u8 piece is literal text, borrowed from 'targetstr'; a
+// size piece is the index of the capture to substitute (0 is the whole match).
+//
+// `\` followed by a decimal digit becomes a capture index, and `\\` becomes a
+// literal backslash. Any other use of a backslash, including a trailing one, is
+// an error. On success, the caller must free the returned slice (but not the
+// literal pieces within it). On error, nothing needs to be freed.
+fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = {
+	const bytes = strings::toutf8(targetstr);
+	let target: []([]u8 | size) = alloc([], 1);
+	let iter = strings::iter(targetstr);
+	// bytes[start..end] is the literal run which has been scanned but not
+	// yet added to target.
+	let start = 0z, end = 0z;
+	for (true) match (strings::next(&iter)) {
+	case void =>
+		if (start != end) {
+			append(target, bytes[start..]);
+		};
+		break;
+	case let r: rune =>
+		if (r == '\\') {
+			// Flush the literal text which precedes the escape.
+			if (start != end) {
+				append(target, bytes[start..end]);
+			};
+
+			const r = match (strings::next(&iter)) {
+			case void =>
+				// Don't leak the partially built slice.
+				free(target);
+				return "Trailing backslash": error;
+			case let r: rune =>
+				yield r;
+			};
+
+			// A valid escape is a backslash plus one ASCII character,
+			// so it always occupies exactly two bytes.
+			end += 2;
+			if (r == '\\') {
+				// Emit a literal backslash by letting the next
+				// literal run begin at the second backslash of
+				// the pair. This keeps the piece an unambiguous
+				// []u8 rather than something which could be
+				// taken for a capture index.
+				start = end - 1;
+			} else if (ascii::isdigit(r)) {
+				// Convert the ASCII digit to its numeric value.
+				append(target, r: u32: size - 0x30);
+				start = end;
+			} else {
+				// Don't leak the partially built slice.
+				free(target);
+				return "Backslash must be followed by positive decimal number or a backslash": error;
+			};
+		} else {
+			end += utf8::runesz(r);
+		};
+	};
+
+	return target;
+};
+
+// Replaces all non-overlapping matches of a regular expression against a string
+// with 'targetstr'. 'targetstr' isn't interpreted in any special way; all
+// backslashes are treated literally. The caller must free the return value.
+export fn rawreplace(re: *regex, string: str, targetstr: str) str = {
+	const matches = findall(re, string);
+	if (len(matches) == 0) {
+		return strings::dup(string);
+	};
+	defer result_freeall(matches);
+
+	const src = strings::toutf8(string);
+	const repl = strings::toutf8(targetstr);
+	let out: []u8 = [];
+
+	// cursor is the offset in src of the first byte not yet copied.
+	let cursor = 0z;
+	for (let i = 0z; i < len(matches); i += 1) {
+		const whole = matches[i][0];
+		// Unmatched text before this match, then the replacement.
+		append(out, src[cursor..whole.start_bytesize]...);
+		append(out, repl...);
+		cursor = whole.end_bytesize;
+	};
+	// Whatever follows the last match.
+	append(out, src[cursor..]...);
+
+	return strings::fromutf8(out)!;
+};
+
// Frees a [[result]].
export fn result_free(s: result) void = {
free(s);