hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit e62e47b5de35c8dde6dd2e917f54d3118d68c6dc
parent 798e7b63d399ae347bf85bd512581cfc563bfc5e
Author: Sebastian <sebastian@sebsite.pw>
Date:   Sun, 23 Apr 2023 02:41:35 -0400

regex: add replace and rawreplace

Implements: https://todo.sr.ht/~sircmpwn/hare/710
Signed-off-by: Sebastian <sebastian@sebsite.pw>

Diffstat:
Mregex/+test.ha | 110+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mregex/regex.ha | 112+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 222 insertions(+), 0 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha @@ -133,6 +133,71 @@ fn run_findall_case( }; }; +fn run_replace_case( + expr: str, + string: str, + target: str, + expected: (str | void), +) void = { + const re = match (compile(expr)) { + case let re: regex => yield re; + case let e: error => + fmt::errorln(e)!; + fmt::errorfln("Expected expression /{}/ to compile, but it errored", + expr)!; + abort(); + }; + defer finish(&re); + + match (replace(&re, string, target)) { + case let e: error => + if (expected is str) { + fmt::errorln(e)!; + fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\"", + expr, string, target, expected as str)!; + abort(); + }; + case let s: str => + defer free(s); + if (expected is void) { + fmt::errorln("Expected replace to fail, but it did not")!; + fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" return=\"{}\"", + expr, string, target, s)!; + abort(); + }; + if (expected as str != s) { + fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\" return=\"{}\"", + expr, string, target, expected as str, s)!; + abort(); + }; + }; +}; + +fn run_rawreplace_case( + expr: str, + string: str, + target: str, + expected: str, +) void = { + const re = match (compile(expr)) { + case let re: regex => yield re; + case let e: error => + fmt::errorln(e)!; + fmt::errorfln("Expected expression /{}/ to compile, but it errored", + expr)!; + abort(); + }; + defer finish(&re); + + const s = rawreplace(&re, string, target); + defer free(s); + if (expected != s) { + fmt::errorfln("expr=/{}/ string=\"{}\" target=\"{}\" expected=\"{}\" return=\"{}\"", + expr, string, target, expected, s)!; + abort(); + }; +}; + @test fn find() void = { const cases = [ // literals @@ -605,3 +670,48 @@ fn run_findall_case( run_findall_case(expr, string, should_match, targets); }; }; + +@test fn replace() void = { + const cases: [_](str, str, str, (str | void)) = [ + (`ab.`, "hello abc and abあ test abq thanks", `xyz`, + "hello xyz and xyz test xyz thanks"), 
+ (`([Hh])ello`, "Hello world and hello Hare.", `\1owdy`, + "Howdy world and howdy Hare."), + (`fo{2,}`, "fo foo fooofoof oofoo", `\0bar`, + "fo foobar fooobarfoobarf oofoobar"), + (`(1)(2)(3)(4)(5)(6)(7)(8)(9)(10)`, "12345678910", `\10`, + "10"), + (`...?`, "abcdefgh", `\7\0\8`, + "abcdefgh"), + (`...?`, "abcdefgh", `\7\0\`, void), + ]; + + for (let i = 0z; i < len(cases); i += 1) { + const expr = cases[i].0; + const string = cases[i].1; + const target = cases[i].2; + const expected = cases[i].3; + run_replace_case(expr, string, target, expected); + }; +}; + +@test fn rawreplace() void = { + const cases = [ + (`ab.`, "hello abc and abあ test abq thanks", "xyz", + "hello xyz and xyz test xyz thanks"), + (`([Hh])ello`, "Hello world and hello Hare.", `\howdy\`, + `\howdy\ world and \howdy\ Hare.`), + (`fo{2,}`, "fo foo fooofoof oofoo", `\0bar`, + `fo \0bar \0bar\0barf oo\0bar`), + (`\\\\`, `\\\\\\\\`, `\00\1`, + `\00\1\00\1\00\1\00\1`), + ]; + + for (let i = 0z; i < len(cases); i += 1) { + const expr = cases[i].0; + const string = cases[i].1; + const target = cases[i].2; + const expected = cases[i].3; + run_rawreplace_case(expr, string, target, expected); + }; +}; diff --git a/regex/regex.ha b/regex/regex.ha @@ -823,6 +823,118 @@ export fn findall(re: *regex, string: str) []result = { return res; }; +// Replaces all non-overlapping matches of a regular expression against a string +// with 'targetstr'. +// +// A backslash followed by a single decimal number within 'targetstr' is +// replaced by the capture at that index (starting at 1), or an empty string if +// no such capture exists. For example, `\1` is replaced with the first capture, +// `\2` with the second, etc. `\0` is substituted with the entire substring that +// was matched. `\\` is replaced with a literal backslash. +// +// An error is only returned if 'targetstr' isn't formatted correctly. 
+export fn replace(re: *regex, string: str, targetstr: str) (str | error) = { + const matches = findall(re, string); + if (len(matches) == 0) { + return strings::dup(string); + }; + defer result_freeall(matches); + + const target = parse_replace_target(targetstr)?; + defer free(target); + + const bytes = strings::toutf8(string); + let buf = alloc(bytes[..matches[0][0].start_bytesize]...); + + for (let i = 0z; i < len(matches); i += 1) { + for (let j = 0z; j < len(target); j += 1) { + match (target[j]) { + case let b: []u8 => + append(buf, b...); + case let z: size => + if (z >= len(matches[i])) yield; + const b = strings::toutf8(matches[i][z].content); + append(buf, b...); + }; + }; + const start = matches[i][0].end_bytesize; + const end = if (i == len(matches) - 1) len(bytes) + else matches[i + 1][0].start_bytesize; + append(buf, bytes[start..end]...); + }; + + return strings::fromutf8(buf)!; +}; + +fn parse_replace_target(targetstr: str) ([]([]u8 | size) | error) = { + const bytes = strings::toutf8(targetstr); + let target: []([]u8 | size) = alloc([], 1); + let iter = strings::iter(targetstr); + let start = 0z, end = 0z; + for (true) match (strings::next(&iter)) { + case void => + if (start != end) { + append(target, bytes[start..]); + }; + break; + case let r: rune => + if (r == '\\') { + if (start != end) { + append(target, bytes[start..end]); + }; + + const r = match (strings::next(&iter)) { + case void => + return "Trailing backslash": error; + case let r: rune => + yield r; + }; + + if (r == '\\') { + append(target, '\\'); + } else if (ascii::isdigit(r)) { + append(target, r: u32: size - 0x30); + } else { + return "Backslash must be followed by positive decimal number or a backslash": error; + }; + + end += 2; + start = end; + } else { + end += utf8::runesz(r); + }; + }; + + return target; +}; + +// Replaces all non-overlapping matches of a regular expression against a string +// with 'targetstr'. 
'targetstr' isn't interpreted in any special way; all +// backslashes are treated literally. +export fn rawreplace(re: *regex, string: str, targetstr: str) str = { + const matches = findall(re, string); + if (len(matches) == 0) { + return strings::dup(string); + }; + defer result_freeall(matches); + + const target = strings::toutf8(targetstr); + const bytes = strings::toutf8(string); + let buf: []u8 = []; + + append(buf, bytes[..matches[0][0].start_bytesize]...); + for (let i = 1z; i < len(matches); i += 1) { + append(buf, target...); + const start = matches[i - 1][0].end_bytesize; + const end = matches[i][0].start_bytesize; + append(buf, bytes[start..end]...); + }; + append(buf, target...); + append(buf, bytes[matches[len(matches) - 1][0].end_bytesize..]...); + + return strings::fromutf8(buf)!; +}; + // Frees a [[result]]. export fn result_free(s: result) void = { free(s);