commit 79e76b127206608c7c55f6b791bcd9844c9bdcdd
parent b3d613259690b701f50e2a03384d59304bc16bd7
Author: Vlad-Stefan Harbuz <vlad@vladh.net>
Date: Fri, 8 Apr 2022 14:03:42 +0200
add regex
Signed-off-by: Vlad-Stefan Harbuz <vlad@vladh.net>
Diffstat:
A | regex/+test.ha | | | 598 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
A | regex/README | | | 41 | +++++++++++++++++++++++++++++++++++++++++ |
A | regex/regex.ha | | | 813 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
M | scripts/gen-stdlib | | | 11 | +++++++++++ |
M | stdlib.mk | | | 33 | +++++++++++++++++++++++++++++++++ |
5 files changed, 1496 insertions(+), 0 deletions(-)
diff --git a/regex/+test.ha b/regex/+test.ha
@@ -0,0 +1,598 @@
+// License: MPL-2.0
+// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
+use fmt;
+use io;
+use os;
+use strings;
+
+// Switch for the optional performance cases at the end of the tests below;
+// they are only executed when this constant is true.
+def PERFTEST_ON: bool = false;
+// The outcome a test case expects: the expression matches the input, does not
+// match it, or an error is reported.
+type matchres = enum { MATCH, NOMATCH, ERROR };
+
+// Compiles expr, runs find() on string and aborts the test run via fmt::fatal
+// when the outcome differs from expected.
+//
+// For an expected match, start and end are compared with the bounds of the
+// main match group (the first element of the result). With
+// expected == matchres::ERROR, an error from either compile() or find() counts
+// as success.
+//
+// The compiled regex and the returned match groups are released before
+// returning, following the usage shown in the module README.
+fn run_find_case(
+	expr: str,
+	string: str,
+	expected: matchres,
+	start: int,
+	end: int
+) void = {
+	const re = match (compile(expr)) {
+	case let re: regex => yield re;
+	case let e: error =>
+		if (expected == matchres::MATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to match, but it errored",
+				expr);
+		};
+		if (expected == matchres::NOMATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to not match, but it errored",
+				expr);
+		};
+		// An error was expected, so there is nothing left to check.
+		return;
+	};
+	defer regex_free(re);
+
+	match (find(re, string)) {
+	case void =>
+		if (expected == matchres::MATCH) {
+			fmt::fatal("Expected expression /{}/ to match string \"{}\", but it did not",
+				expr, string);
+		};
+		if (expected == matchres::ERROR) {
+			fmt::fatal("Expression /{}/ failed to match, but should have errored",
+				expr);
+		};
+
+	case let m: []matchgroup =>
+		defer free(m);
+		if (expected == matchres::NOMATCH) {
+			fmt::fatal("Expected expression /{}/ to not match string \"{}\", but it did",
+				expr, string);
+		};
+		if (expected == matchres::ERROR) {
+			fmt::fatal("Expression /{}/ matched, but should have errored",
+				expr);
+		};
+		if (start: size != m[0].start) {
+			fmt::fatal("Expected start of main match group to be {} but it was {}",
+				start, m[0].start);
+		};
+		if (end: size != m[0].end) {
+			fmt::fatal("Expected end of main match group to be {} but it was {}",
+				end, m[0].end);
+		};
+
+	case let e: error =>
+		// Print the underlying error first, as the compile() branch does.
+		if (expected == matchres::MATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to match, but it errored",
+				expr);
+		};
+		if (expected == matchres::NOMATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to not match, but it errored",
+				expr);
+		};
+	};
+};
+
+// Compiles expr, runs findall() on string and aborts the test run via
+// fmt::fatal when the outcome differs from expected.
+//
+// For an expected match, count is compared with the number of match group
+// sets returned. With expected == matchres::ERROR, an error from either
+// compile() or findall() counts as success.
+//
+// The compiled regex and the returned group sets are released before
+// returning, following the usage shown in the module README.
+fn run_findall_case(
+	expr: str,
+	string: str,
+	expected: matchres,
+	count: int
+) void = {
+	const re = match (compile(expr)) {
+	case let re: regex => yield re;
+	case let e: error =>
+		if (expected == matchres::MATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to match, but it errored",
+				expr);
+		};
+		if (expected == matchres::NOMATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to not match, but it errored",
+				expr);
+		};
+		// An error was expected, so there is nothing left to check.
+		return;
+	};
+	defer regex_free(re);
+
+	match (findall(re, string)) {
+	case void =>
+		if (expected == matchres::MATCH) {
+			fmt::fatal("Expected expression /{}/ to match string \"{}\", but it did not",
+				expr, string);
+		};
+		if (expected == matchres::ERROR) {
+			fmt::fatal("Expression /{}/ failed to match, but should have errored",
+				expr);
+		};
+
+	case let groupsets: [][]matchgroup =>
+		defer freeall(groupsets);
+		if (expected == matchres::NOMATCH) {
+			fmt::fatal("Expected expression /{}/ to not match string \"{}\", but it did",
+				expr, string);
+		};
+		if (expected == matchres::ERROR) {
+			fmt::fatal("Expression /{}/ matched, but should have errored",
+				expr);
+		};
+		if (count: size != len(groupsets)) {
+			fmt::fatal("Expected to find {} matches but found {}",
+				count, len(groupsets));
+		};
+
+	case let e: error =>
+		// Print the underlying error first, as the compile() branch does.
+		if (expected == matchres::MATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to match, but it errored",
+				expr);
+		};
+		if (expected == matchres::NOMATCH) {
+			fmt::println(e)!;
+			fmt::fatal("Expected expression /{}/ to not match, but it errored",
+				expr);
+		};
+	};
+};
+
+// Table-driven test for find(). Each case is a tuple of
+// (expression, input string, expected outcome, expected start, expected end)
+// that is handed to run_find_case(). An expected end of -1 is shorthand for
+// len(input string); it is expanded in the loop below the table.
+@test fn find() void = {
+ const cases = [
+ // literals
+ (`^$`, "", matchres::MATCH, 0, 0),
+ (``, "", matchres::MATCH, 0, -1),
+ (`abcd`, "abcd", matchres::MATCH, 0, -1),
+ (`abc`, "abcd", matchres::MATCH, 0, 3),
+ (`bcd`, "abcd", matchres::MATCH, 1, 4),
+ (`^abc$`, "abc", matchres::MATCH, 0, -1),
+ (`^abc$`, "axc", matchres::NOMATCH, 0, -1),
+ // .
+ (`^.$`, "x", matchres::MATCH, 0, 1),
+ (`^.$`, "y", matchres::MATCH, 0, 1),
+ (`^.$`, "", matchres::NOMATCH, 0, 1),
+ // +
+ (`^a+$`, "a", matchres::MATCH, 0, 1),
+ (`^a+$`, "aaa", matchres::MATCH, 0, 3),
+ (`^a+$`, "", matchres::NOMATCH, 0, 0),
+ (`^(abc)+$`, "abc", matchres::MATCH, 0, 3),
+ (`^(abc)+$`, "abcabc", matchres::MATCH, 0, 6),
+ (`^(abc)+$`, "", matchres::NOMATCH, 0, 0),
+ // *
+ (`^a*$`, "", matchres::MATCH, 0, 0),
+ (`^a*$`, "aaaa", matchres::MATCH, 0, 4),
+ (`^a*$`, "b", matchres::NOMATCH, 0, 0),
+ (`^(abc)*$`, "", matchres::MATCH, 0, 0),
+ (`^(abc)*$`, "abc", matchres::MATCH, 0, 3),
+ (`^(abc)*$`, "abcabc", matchres::MATCH, 0, 6),
+ (`^(abc)*$`, "bbb", matchres::NOMATCH, 0, 3),
+ // ?
+ (`^a?$`, "", matchres::MATCH, 0, 0),
+ (`^a?$`, "a", matchres::MATCH, 0, 1),
+ (`^a?$`, "b", matchres::NOMATCH, 0, 0),
+ (`^(abc)?$`, "", matchres::MATCH, 0, 0),
+ (`^(abc)?$`, "abc", matchres::MATCH, 0, 3),
+ (`^(abc)?$`, "bbb", matchres::NOMATCH, 0, 0),
+ // ^ and $
+ (`^a*`, "aaaa", matchres::MATCH, 0, 4),
+ (`a*$`, "aaaa", matchres::MATCH, 0, 4),
+ (`^a*$`, "aaaa", matchres::MATCH, 0, 4),
+ (`a*`, "aaaa", matchres::MATCH, 0, 4),
+ (`b*`, "aaaabbbb", matchres::MATCH, 4, 8),
+ (`^b*`, "aaaabbbb", matchres::MATCH, 0, 0),
+ (`b*$`, "aaaabbbb", matchres::MATCH, 4, 8),
+ // (a|b)
+ (`^(cafe|b)x$`, "cafex", matchres::MATCH, 0, 5),
+ (`^(cafe|b)x$`, "bx", matchres::MATCH, 0, 2),
+ (`^(cafe|b)x$`, "XXXx", matchres::NOMATCH, 0, 0),
+ (`^(cafe|b)x$`, "bx", matchres::MATCH, 0, 2),
+ (
+ `^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`,
+ "Jagdhaftpflichtversicherungsbetrag",
+ matchres::MATCH, 0, -1
+ ),
+ (
+ `^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`,
+ "Jagdhaftpflichtversicherungsbetrug",
+ matchres::NOMATCH, 0, -1
+ ),
+ (
+ `^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`,
+ "Jagdversicherungspolice",
+ matchres::NOMATCH, 0, -1
+ ),
+ (`)`, "", matchres::ERROR, 0, 0),
+ // [abc]
+ (`^test[abc]$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[abc]$`, "testb", matchres::MATCH, 0, -1),
+ (`^test[abc]$`, "testc", matchres::MATCH, 0, -1),
+ (`^test[abc]$`, "testd", matchres::NOMATCH, 0, -1),
+ (`^test[abc]*$`, "test", matchres::MATCH, 0, -1),
+ (`^test[abc]*$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[abc]*$`, "testaaa", matchres::MATCH, 0, -1),
+ (`^test[abc]*$`, "testabc", matchres::MATCH, 0, -1),
+ (`^test[abc]?$`, "test", matchres::MATCH, 0, -1),
+ (`^test[abc]?$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[abc]+$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[abc]+$`, "test", matchres::NOMATCH, 0, -1),
+ (`^test[]abc]$`, "test]", matchres::MATCH, 0, -1),
+ (`^test[[abc]$`, "test[", matchres::MATCH, 0, -1),
+ (`^test[^abc]$`, "testd", matchres::MATCH, 0, -1),
+ (`^test[^abc]$`, "test!", matchres::MATCH, 0, -1),
+ (`^test[^abc]$`, "testa", matchres::NOMATCH, 0, -1),
+ (`^test[^abc]$`, "testb", matchres::NOMATCH, 0, -1),
+ (`^test[^abc]$`, "testc", matchres::NOMATCH, 0, -1),
+ (`^test[^]abc]$`, "test]", matchres::NOMATCH, 0, -1),
+ (`^test[^abc[]$`, "test[", matchres::NOMATCH, 0, -1),
+ (`^test[^abc]*$`, "testd", matchres::MATCH, 0, -1),
+ (`^test[^abc]*$`, "testqqqqq", matchres::MATCH, 0, -1),
+ (`^test[^abc]*$`, "test", matchres::MATCH, 0, -1),
+ (`^test[^abc]*$`, "testc", matchres::NOMATCH, 0, -1),
+ (`^test[^abc]?$`, "test", matchres::MATCH, 0, -1),
+ (`^test[^abc]?$`, "testd", matchres::MATCH, 0, -1),
+ (`^test[^abc]?$`, "testc", matchres::NOMATCH, 0, -1),
+ (`^test[^abc]+$`, "testd", matchres::MATCH, 0, -1),
+ (`^test[^abc]+$`, "testddd", matchres::MATCH, 0, -1),
+ (`^test[^abc]+$`, "testc", matchres::NOMATCH, 0, -1),
+ (`^test[^abc]+$`, "testcccc", matchres::NOMATCH, 0, -1),
+ (`^test[a-c]$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[a-c]$`, "testb", matchres::MATCH, 0, -1),
+ (`^test[a-c]$`, "testc", matchres::MATCH, 0, -1),
+ (`^test[a-c]$`, "testd", matchres::NOMATCH, 0, -1),
+ (`^test[a-c]$`, "test!", matchres::NOMATCH, 0, -1),
+ (`^test[a-c]$`, "test-", matchres::NOMATCH, 0, -1),
+ (`^test[-a-c]$`, "test-", matchres::MATCH, 0, -1),
+ (`^test[a-c-]$`, "test-", matchres::MATCH, 0, -1),
+ (`^test[a-c]*$`, "test", matchres::MATCH, 0, -1),
+ (`^test[a-c]*$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[a-c]*$`, "testabb", matchres::MATCH, 0, -1),
+ (`^test[a-c]*$`, "testddd", matchres::NOMATCH, 0, -1),
+ (`^test[a-c]?$`, "test", matchres::MATCH, 0, -1),
+ (`^test[a-c]?$`, "testb", matchres::MATCH, 0, -1),
+ (`^test[a-c]?$`, "testd", matchres::NOMATCH, 0, -1),
+ (`^test[a-c]+$`, "test", matchres::NOMATCH, 0, -1),
+ (`^test[a-c]+$`, "testbcbc", matchres::MATCH, 0, -1),
+ (`^test[a-c]+$`, "testd", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c]$`, "testa", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c]$`, "testb", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c]$`, "testc", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c]$`, "testd", matchres::MATCH, 0, -1),
+ (`^test[^a-c]$`, "test!", matchres::MATCH, 0, -1),
+ (`^test[^a-c]$`, "test-", matchres::MATCH, 0, -1),
+ (`^test[^-a-c]$`, "test-", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c-]$`, "test-", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c-]*$`, "test", matchres::MATCH, 0, -1),
+ (`^test[^a-c-]*$`, "test--", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c-]*$`, "testq", matchres::MATCH, 0, -1),
+ (`^test[^a-c-]?$`, "test", matchres::MATCH, 0, -1),
+ (`^test[^a-c-]?$`, "testq", matchres::MATCH, 0, -1),
+ (`^test[^a-c-]?$`, "test-", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c-]+$`, "test", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c-]+$`, "testb", matchres::NOMATCH, 0, -1),
+ (`^test[^a-c-]+$`, "testddd", matchres::MATCH, 0, -1),
+ (`([a-z][a-z0-9]*,)+`, "a5,b7,c9,", matchres::MATCH, 0, -1),
+ // [:alpha:] etc.
+ (`^test[[:alnum:]]+$`, "testaA1", matchres::MATCH, 0, -1),
+ (`^test[[:alnum:]]+$`, "testa_1", matchres::NOMATCH, 0, -1),
+ (`^test[[:alpha:]]+$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[[:alpha:]]+$`, "testa1", matchres::NOMATCH, 0, -1),
+ (`^test[[:blank:]]+$`, "testa", matchres::NOMATCH, 0, -1),
+ (`^test[[:blank:]]+$`, "test ", matchres::MATCH, 0, -1),
+ (`^test[^[:blank:]]+$`, "testx", matchres::MATCH, 0, -1),
+ (`^test[[:blank:]]+$`, "test ", matchres::MATCH, 0, -1),
+ (`^test[^[:cntrl:]]+$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[[:digit:]]$`, "test1", matchres::MATCH, 0, -1),
+ (`^test[[:digit:]]$`, "testa", matchres::NOMATCH, 0, -1),
+ (`^test[[:graph:]]+$`, "test\t", matchres::NOMATCH, 0, -1),
+ (`^test[[:lower:]]+$`, "testa", matchres::MATCH, 0, -1),
+ (`^test[[:lower:]]+$`, "testA", matchres::NOMATCH, 0, -1),
+ (`^test[[:print:]]+$`, "test\t", matchres::NOMATCH, 0, -1),
+ (`^test[[:punct:]]+$`, "testA", matchres::NOMATCH, 0, -1),
+ (`^test[[:punct:]]+$`, "test!", matchres::MATCH, 0, -1),
+ (`^test[[:space:]]+$`, "test ", matchres::MATCH, 0, -1),
+ (`^test[[:upper:]]+$`, "testa", matchres::NOMATCH, 0, -1),
+ (`^test[[:upper:]]+$`, "testA", matchres::MATCH, 0, -1),
+ (`^test[[:word:]]+$`, "test!2", matchres::NOMATCH, 0, -1),
+ (`^test[[:word:]]+$`, "test_2", matchres::MATCH, 0, -1),
+ (`^test[[:xdigit:]]+$`, "testCAFE", matchres::MATCH, 0, -1),
+ // [:alpha:] etc. plus extra characters
+ (`^test[[:digit:]][[:alpha:]]$`, "test1a", matchres::MATCH, 0, -1),
+ (`^test[[:digit:]][[:alpha:]]$`, "testa1", matchres::NOMATCH, 0, -1),
+ (`^test[[:alnum:]!]+$`, "testa!1", matchres::MATCH, 0, -1),
+ (`^test[@[:alnum:]!]+$`, "testa!@1", matchres::MATCH, 0, -1),
+ // Escaped characters such as \+
+ (`^a\+b$`, "a+b", matchres::MATCH, 0, -1),
+ (`^a\?b$`, "a?b", matchres::MATCH, 0, -1),
+ (`^a\*b$`, "a*b", matchres::MATCH, 0, -1),
+ (`^a\^b$`, "a^b", matchres::MATCH, 0, -1),
+ (`^a\$b$`, "a$b", matchres::MATCH, 0, -1),
+ (`^a\[b$`, "a[b", matchres::MATCH, 0, -1),
+ (`^a\]b$`, "a]b", matchres::MATCH, 0, -1),
+ (`^a\(b$`, "a(b", matchres::MATCH, 0, -1),
+ (`^a\)b$`, "a)b", matchres::MATCH, 0, -1),
+ (`^a\|b$`, "a|b", matchres::MATCH, 0, -1),
+ (`^a\.b$`, "a.b", matchres::MATCH, 0, -1),
+ (`^a\\b$`, "a\\b", matchres::MATCH, 0, -1),
+ // {m,n}
+ (`^x(abc){1,2}$`, "xabc", matchres::MATCH, 0, -1),
+ (`^x(abc){1,2}$`, "xabcabc", matchres::MATCH, 0, -1),
+ (`^x(abc){1,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1),
+ (`^x(abc){,2}$`, "xabc", matchres::MATCH, 0, -1),
+ (`^x(abc){,2}$`, "xabcabc", matchres::MATCH, 0, -1),
+ (`^x(abc){,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1),
+ (`^x(abc){1,}$`, "xabc", matchres::MATCH, 0, -1),
+ (`^x(abc){1,}$`, "xabcabc", matchres::MATCH, 0, -1),
+ (`^x(abc){3,}$`, "xabcabc", matchres::NOMATCH, 0, -1),
+ (`^x(abc){3,}$`, "xabcabcabc", matchres::MATCH, 0, -1),
+ (`^x(abc){2,2}$`, "xabcabc", matchres::MATCH, 0, -1),
+ (`^x(abc){2,2}$`, "xabc", matchres::NOMATCH, 0, -1),
+ (`^x(abc){2,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1),
+ (`^x(abc){-1,2}$`, "xabcabcabc", matchres::ERROR, 0, -1),
+ (`^x(abc){x,2}$`, "xabcabcabc", matchres::ERROR, 0, -1),
+ (`^x(abc){0,-2}$`, "xabcabcabc", matchres::ERROR, 0, -1),
+ // various
+ (
+ `^.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+$`,
+ "X1024facefacecaaaaafffcafebabebabe",
+ matchres::MATCH, 0, -1,
+ ),
+ (
+ `.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+`,
+ "X1024facefacecaaaaafffcafebabebabe",
+ matchres::MATCH, 0, -1,
+ ),
+ (
+ `^.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+$`,
+ "1024facefacecaaaaafffcafebabebabe",
+ matchres::NOMATCH, 0, 0,
+ ),
+ (
+ `.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+`,
+ "1024facefacecaaaaafffcafebabebabe",
+ matchres::MATCH, 3, -1,
+ ),
+ (
+ `^([a-zA-Z]{1,2}[[:digit:]]{1,2})[[:space:]]*([[:digit:]][a-zA-Z]{2})$`,
+ "M15 4QN",
+ matchres::MATCH, 0, -1
+ ),
+ // tests from perl
+ (`abc`, "abc", matchres::MATCH, 0, -1),
+ (`abc`, "xbc", matchres::NOMATCH, 0, 0),
+ (`abc`, "axc", matchres::NOMATCH, 0, 0),
+ (`abc`, "abx", matchres::NOMATCH, 0, 0),
+ (`abc`, "xabcy", matchres::MATCH, 1, 4),
+ (`abc`, "ababc", matchres::MATCH, 2, -1),
+ (`ab*c`, "abc", matchres::MATCH, 0, -1),
+ (`ab*bc`, "abc", matchres::MATCH, 0, -1),
+ (`ab*bc`, "abbc", matchres::MATCH, 0, -1),
+ (`ab*bc`, "abbbbc", matchres::MATCH, 0, -1),
+ (`ab{0,}bc`, "abbbbc", matchres::MATCH, 0, -1),
+ (`ab+bc`, "abbc", matchres::MATCH, 0, -1),
+ (`ab+bc`, "abc", matchres::NOMATCH, 0, 0),
+ (`ab+bc`, "abq", matchres::NOMATCH, 0, 0),
+ (`ab{1,}bc`, "abq", matchres::NOMATCH, 0, 0),
+ (`ab+bc`, "abbbbc", matchres::MATCH, 0, -1),
+ (`ab{1,}bc`, "abbbbc", matchres::MATCH, 0, -1),
+ (`ab{1,3}bc`, "abbbbc", matchres::MATCH, 0, -1),
+ (`ab{3,4}bc`, "abbbbc", matchres::MATCH, 0, -1),
+ (`ab{4,5}bc`, "abbbbc", matchres::NOMATCH, 0, 0),
+ (`ab?bc`, "abbc", matchres::MATCH, 0, -1),
+ (`ab?bc`, "abc", matchres::MATCH, 0, -1),
+ (`ab{0,1}bc`, "abc", matchres::MATCH, 0, -1),
+ (`ab?bc`, "abbbbc", matchres::NOMATCH, 0, 0),
+ (`ab?c`, "abc", matchres::MATCH, 0, -1),
+ (`ab{0,1}c`, "abc", matchres::MATCH, 0, -1),
+ (`^abc$`, "abc", matchres::MATCH, 0, -1),
+ (`^abc$`, "abcc", matchres::NOMATCH, 0, 0),
+ (`^abc`, "abcc", matchres::MATCH, 0, 3),
+ (`^abc$`, "aabc", matchres::NOMATCH, 0, 0),
+ (`abc$`, "aabc", matchres::MATCH, 1, -1),
+ (`^`, "abc", matchres::MATCH, 0, 0),
+ (`$`, "abc", matchres::MATCH, 3, 3),
+ (`a.c`, "abc", matchres::MATCH, 0, -1),
+ (`a.c`, "axc", matchres::MATCH, 0, -1),
+ (`a.*c`, "axyzc", matchres::MATCH, 0, -1),
+ (`a.*c`, "axyzd", matchres::NOMATCH, 0, 0),
+ (`a[bc]d`, "abc", matchres::NOMATCH, 0, 0),
+ (`a[bc]d`, "abd", matchres::MATCH, 0, -1),
+ (`a[b-d]e`, "abd", matchres::NOMATCH, 0, 0),
+ (`a[b-d]e`, "ace", matchres::MATCH, 0, -1),
+ (`a[b-d]`, "aac", matchres::MATCH, 1, -1),
+ (`a[-b]`, "a-", matchres::MATCH, 0, -1),
+ (`a[b-]`, "a-", matchres::MATCH, 0, -1),
+ (`a[b-a]`, "-", matchres::ERROR, 0, 0),
+ (`a[]b`, "-", matchres::ERROR, 0, 0),
+ (`a[`, "-", matchres::ERROR, 0, 0),
+ (`a]`, "a]", matchres::MATCH, 0, -1),
+ (`a[]]b`, "a]b", matchres::MATCH, 0, -1),
+ (`a[^bc]d`, "aed", matchres::MATCH, 0, -1),
+ (`a[^bc]d`, "abd", matchres::NOMATCH, 0, 0),
+ (`a[^-b]c`, "adc", matchres::MATCH, 0, -1),
+ (`a[^-b]c`, "a-c", matchres::NOMATCH, 0, 0),
+ (`a[^]b]c`, "a]c", matchres::NOMATCH, 0, 0),
+ (`a[^]b]c`, "adc", matchres::MATCH, 0, -1),
+ (`()ef`, "def", matchres::MATCH, 1, -1),
+ (`*a`, "-", matchres::ERROR, 0, 0),
+ (`(*)b`, "-", matchres::ERROR, 0, 0),
+ (`$b`, "b", matchres::ERROR, 0, 0),
+ (`a\`, "-", matchres::ERROR, 0, 0),
+ (`a\(b`, "a(b", matchres::MATCH, 0, -1),
+ (`a\(*b`, "ab", matchres::MATCH, 0, -1),
+ (`a\(*b`, "a((b", matchres::MATCH, 0, -1),
+ (`a\\b`, `a\b`, matchres::MATCH, 0, -1),
+ (`abc)`, "-", matchres::ERROR, 0, 0),
+ (`(abc`, "-", matchres::ERROR, 0, 0),
+ (`(a)b(c)`, "abc", matchres::MATCH, 0, -1),
+ (`a+b+c`, "aabbabc", matchres::MATCH, 4, -1),
+ (`a{1,}b{1,}c`, "aabbabc", matchres::MATCH, 4, -1),
+ (`a**`, "-", matchres::ERROR, 0, 0),
+ (`)(`, "-", matchres::ERROR, 0, 0),
+ (`[^ab]*`, "cde", matchres::MATCH, 0, -1),
+ (`abc`, "", matchres::NOMATCH, 0, 0),
+ (`a*`, "", matchres::MATCH, 0, -1),
+ (`([abc])*d`, "abbbcd", matchres::MATCH, 0, -1),
+ (`([abc])*bcd`, "abcd", matchres::MATCH, 0, -1),
+ (`abcd*efg`, "abcdefg", matchres::MATCH, 0, -1),
+ (`ab*`, "xabyabbbz", matchres::MATCH, 1, 3),
+ (`ab*`, "xayabbbz", matchres::MATCH, 1, 2),
+ (`(ab|cd)e`, "abcde", matchres::MATCH, 2, -1),
+ (`[abhgefdc]ij`, "hij", matchres::MATCH, 0, -1),
+ (`^(ab|cd)e`, "abcde", matchres::NOMATCH, 0, 0),
+ (`(abc|)ef`, "abcdef", matchres::MATCH, 4, -1),
+ (`(a|b)c*d`, "abcd", matchres::MATCH, 1, -1),
+ (`(ab|ab*)bc`, "abc", matchres::MATCH, 0, -1),
+ (`a([bc]*)c*`, "abc", matchres::MATCH, 0, -1),
+ (`a([bc]*)(c*d)`, "abcd", matchres::MATCH, 0, -1),
+ (`a([bc]+)(c*d)`, "abcd", matchres::MATCH, 0, -1),
+ (`a([bc]*)(c+d)`, "abcd", matchres::MATCH, 0, -1),
+ (`a[bcd]*dcdcde`, "adcdcde", matchres::MATCH, 0, -1),
+ (`a[bcd]+dcdcde`, "adcdcde", matchres::NOMATCH, 0, 0),
+ (`(ab|a)b*c`, "abc", matchres::MATCH, 0, -1),
+ (`[a-zA-Z_][a-zA-Z0-9_]*`, "alpha", matchres::MATCH, 0, -1),
+ (`^a(bc+|b[eh])g|.h$`, "abh", matchres::MATCH, 0, -1),
+ (`multiple words of text`, "uh-uh", matchres::NOMATCH, 0, 0),
+ (`multiple words`, "multiple words, yeah", matchres::MATCH, 0, 14),
+ (`(.*)c(.*)`, "abcde", matchres::MATCH, 0, -1),
+ (`\((.*), (.*)\)`, "(a, b)", matchres::MATCH, 0, -1),
+ (`[k]`, "ab", matchres::NOMATCH, 0, 0),
+ (`a[-]?c`, "ac", matchres::MATCH, 0, -1),
+ (`.*d`, "abc\nabd", matchres::MATCH, 0, -1),
+ (`(`, "", matchres::ERROR, 0, 0),
+ (`(x?)?`, "x", matchres::MATCH, 0, -1),
+ (`^*`, "", matchres::ERROR, 0, 0),
+ // Submatch handling
+ (`(a|ab)(c|bcd)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+ (`(a|ab)(bcd|c)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+ (`(ab|a)(c|bcd)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+ (`(ab|a)(bcd|c)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+ (`(a*)(b|abc)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3)
+ (`(a*)(abc|b)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3)
+ (`(a*)(b|abc)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3)
+ (`(a*)(abc|b)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3)
+ (`(a|ab)(c|bcd)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+ (`(a|ab)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+ (`(ab|a)(c|bcd)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+ (`(ab|a)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4)
+ // TODO: whole-expression alternation
+ // (`ab|cd`, "abc", matchres::MATCH, 0, -1),
+ // (`ab|cd`, "abcd", matchres::MATCH, 0, -1),
+ // TODO: multiple alternation
+ // (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1),
+ // (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1),
+ // TODO: nested capture groups
+ // (`((a))`, "abc", matchres::MATCH, 0, -1),
+ // (`((a)(b)c)(d)`, "abcd", matchres::MATCH, 0, -1),
+ // (`(bc+d$|ef*g.|h?i(j|k))`, "effgz", matchres::MATCH, 0, -1),
+ // (`(bc+d$|ef*g.|h?i(j|k))`, "ij", matchres::MATCH, 0, -1),
+ // (`(bc+d$|ef*g.|h?i(j|k))`, "effg", matchres::NOMATCH, 0, 0),
+ // (`(bc+d$|ef*g.|h?i(j|k))`, "bcdd", matchres::NOMATCH, 0, 0),
+ // (`(bc+d$|ef*g.|h?i(j|k))`, "reffgz", matchres::MATCH, 0, -1),
+ // (`((((((((((a))))))))))`, "a", matchres::MATCH, 0, -1),
+ // (`(((((((((a)))))))))`, "a", matchres::MATCH, 0, -1),
+ // (`(([a-z]+):)?([a-z]+)$`, "smil", matchres::MATCH, 0, -1),
+ // (`^((a)c)?(ab)$`, "ab", matchres::MATCH, 0, -1),
+ // TODO: multiple simultaneous capture groups
+ // (`(a+|b)*`, "ab", matchres::MATCH, 0, -1),
+ // (`(a+|b){0,}`, "ab", matchres::MATCH, 0, -1),
+ // (`(a+|b)+`, "ab", matchres::MATCH, 0, -1),
+ // (`(a+|b){1,}`, "ab", matchres::MATCH, 0, -1),
+ // (`(a+|b)?`, "ab", matchres::MATCH, 0, -1),
+ // (`(a+|b){0,1}`, "ab", matchres::MATCH, 0, -1),
+ // NOTE: character sequences not currently supported
+ // (`\0`, "\0", matchres::MATCH, 0, -1),
+ // (`[\0a]`, "\0", matchres::MATCH, 0, -1),
+ // (`[a\0]`, "\0", matchres::MATCH, 0, -1),
+ // (`[^a\0]`, "\0", matchres::NOMATCH, 0, 0),
+ // NOTE: octal sequences not currently supported
+ // (`[\1]`, "\1", matchres::MATCH, 0, -1),
+ // (`\09`, "\0(separate-me)9", matchres::MATCH, 0, -1),
+ // (`\141`, "a", matchres::MATCH, 0, -1),
+ // (`[\41]`, "!", matchres::MATCH, 0, -1),
+ // NOTE: hex sequences not currently supported
+ // (`\xff`, "\377", matchres::MATCH, 0, -1),
+ // NOTE: non-greedy matching not currently supported
+ // (`a.+?c`, "abcabc", matchres::MATCH, 0, -1),
+ // (`.*?\S *:`, "xx:", matchres::MATCH, 0, -1),
+ // (`a[ ]*?\ (\d+).*`, "a 10", matchres::MATCH, 0, -1),
+ // (`a[ ]*?\ (\d+).*`, "a 10", matchres::MATCH, 0, -1),
+ // (`"(\\"|[^"])*?"`, `"\""`, matchres::MATCH, 0, -1),
+ // (`^.*?$`, "one\ntwo\nthree\n", matchres::NOMATCH, 0, 0),
+ // (`a[^>]*?b`, "a>b", matchres::NOMATCH, 0, 0),
+ // (`^a*?$`, "foo", matchres::NOMATCH, 0, 0),
+ // (`^([ab]*?)(?=(b)?)c`, "abc", matchres::MATCH, 0, -1),
+ // (`^([ab]*?)(?!(b))c`, "abc", matchres::MATCH, 0, -1),
+ // (`^([ab]*?)(?<!(a))c`, "abc", matchres::MATCH, 0, -1),
+ ];
+
+ // Run every case, expanding an expected end of -1 to len(string).
+ for (let i = 0z; i < len(cases); i += 1) {
+ const expr = cases[i].0;
+ const string = cases[i].1;
+ const should_match = cases[i].2;
+ const start = cases[i].3;
+ const end = if (cases[i].4 == -1) {
+ yield len(string): int;
+ } else {
+ yield cases[i].4;
+ };
+ run_find_case(expr, string, should_match, start, end);
+ };
+
+ // Optional performance cases, skipped unless PERFTEST_ON is true. They
+ // search the contents of compress/zlib/data+test.ha.
+ // NOTE(review): the file handle is never closed, a single io::read call
+ // is assumed to deliver the whole file, and the full 2000000-byte
+ // buffer (including any bytes that were not filled) is converted to a
+ // string - confirm this is acceptable for a perf-only path.
+ if (PERFTEST_ON) {
+ const f = os::open("compress/zlib/data+test.ha")!;
+ const buf: [2000000]u8 = [0...];
+ io::read(f, buf)!;
+ const test_data = strings::fromutf8(buf);
+
+ const perf_cases = [
+ (
+ `^a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*aaaaaaaaaaaaaaaa$`,
+ "aaaaaaaaaaaaaaaa",
+ matchres::MATCH,
+ 0, -1
+ ),
+ (`.*.*.*rand_out`, test_data, matchres::MATCH, 0, 1211255),
+ (`0x`, test_data, matchres::MATCH, 86, 88),
+ ];
+
+ for (let i = 0z; i < len(perf_cases); i += 1) {
+ const expr = perf_cases[i].0;
+ const string = perf_cases[i].1;
+ const should_match = perf_cases[i].2;
+ const start = perf_cases[i].3;
+ const end = if (perf_cases[i].4 == -1) {
+ yield len(string): int;
+ } else {
+ yield perf_cases[i].4;
+ };
+ run_find_case(expr, string, should_match, start, end);
+ };
+ };
+};
+
+// Table-driven test for findall(). Each entry is a tuple of
+// (expression, input string, expected outcome, expected number of matches)
+// and is forwarded to run_findall_case().
+@test fn findall() void = {
+	const table = [
+		(`ab.`, "hello abc and abz test abq thanks", matchres::MATCH, 3),
+	];
+
+	for (let n = 0z; n < len(table); n += 1) {
+		run_findall_case(table[n].0, table[n].1, table[n].2,
+			table[n].3);
+	};
+
+	// Optional performance case, only executed when PERFTEST_ON is true.
+	if (PERFTEST_ON) {
+		const file = os::open("compress/zlib/data+test.ha")!;
+		const raw: [2000000]u8 = [0...];
+		io::read(file, raw)!;
+		const haystack = strings::fromutf8(raw);
+
+		const perf_table = [
+			(`0x`, haystack, matchres::MATCH, 199053),
+		];
+
+		for (let n = 0z; n < len(perf_table); n += 1) {
+			run_findall_case(perf_table[n].0, perf_table[n].1,
+				perf_table[n].2, perf_table[n].3);
+		};
+	};
+};
diff --git a/regex/README b/regex/README
@@ -0,0 +1,41 @@
+This is an NFA-based regex implementation. It closely adheres to the POSIX
+Extended Regular Expressions specification [0]. Matching is guaranteed to run in
+linear time, and various optimisations have been implemented to ensure good
+performance on most inputs.
+
+By default, matches will be found anywhere in the given string. The ^ and $
+characters can be used to anchor the match to the beginning or end of the
+string.
+
+find() returns a slice of [[regex::matchgroup]]s for the first match. The
+first [[regex::matchgroup]] represents the entire match, while the rest
+represent the submatches, specified in the expression using (parens).
+
+findall() finds all non-overlapping matches in the given string and returns
+a slice of slices of [[regex::matchgroup]]s.
+
+This module implements the POSIX match disambiguation rules by returning
+the longest match among the leftmost matches.
+
+ const re = regex::compile(`[Hh]are`)!;
+ defer regex::regex_free(re);
+
+ const first_match = regex::find(re, "Hello Hare, hello Hare.")!;
+ match (first_match) {
+ case void => void;
+ case let groups: []regex::matchgroup =>
+ defer free(groups);
+ // The match groups provide the content, start index and end index of
+ // the main match, as well as all submatches.
+ };
+
+ const all_matches = regex::findall(re, "Hello hare, hello hare.")!;
+ match (all_matches) {
+ case void => void;
+ case let groupsets: [][]regex::matchgroup =>
+ defer regex::freeall(groupsets);
+ // A slice of multiple match group sets, which can be used similarly
+ // to the find() example.
+ };
+
+[0]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04
diff --git a/regex/regex.ha b/regex/regex.ha
@@ -0,0 +1,813 @@
+// License: MPL-2.0
+// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net>
+use ascii;
+use encoding::utf8;
+use errors;
+use strconv;
+use strings;
+
+// A string describing the error that occurred.
+export type error = !str;
+
+// Recorded by the compiler for the final match instruction. compile() starts
+// out with FLOATING and switches to ANCHORED when it encounters "$".
+export type match_type = enum { ANCHORED, FLOATING };
+// The individual instruction types of a compiled expression. What can be seen
+// from the compiler:
+// - inst_lit holds a literal rune (e.g. an escaped character).
+// - inst_charset refers to a bracket expression: idx is the index of its
+//   charset in the charsets list, is_positive is false for "[^...]".
+// - inst_skip is emitted first when the expression does not begin with "^".
+// - inst_groupstart is emitted for "(".
+// NOTE(review): the remaining instructions are produced and interpreted
+// outside this view; their exact semantics should be confirmed there.
+export type inst_lit = rune,
+ inst_charset = struct { idx: size, is_positive: bool },
+ inst_any = void,
+ inst_split = size,
+ inst_jump = size,
+ inst_skip = void,
+ inst_match = match_type,
+ inst_groupstart = void,
+ inst_groupend = void,
+ inst_repeat = struct {
+ id: size,
+ origin: size,
+ min: (void | size),
+ max: (void | size),
+ };
+// Any one instruction of a compiled expression.
+export type inst = (inst_lit | inst_any | inst_split | inst_jump |
+ inst_skip | inst_match | inst_charset |
+ inst_groupstart | inst_groupend |
+ inst_repeat);
+
+// A (sub)match found as a result of matching a certain string against a regex.
+// It carries the matched content together with its start and end index. In
+// the module's tests the end index is one past the last matched character,
+// e.g. /abc/ on "abcd" gives start 0 and end 3.
+export type matchgroup = struct {
+ content: str,
+ start: size,
+ end: size,
+};
+
+// Per-thread state used while matching.
+// NOTE(review): the code that creates and advances threads is outside this
+// view. Judging by the names only, pc is presumably an index into the
+// instruction list and rep_counters presumably holds one counter per
+// inst_repeat - confirm against the matcher.
+type thread = struct {
+ pc: size,
+ start_idx: size,
+ root_group: matchgroup,
+ groups: []matchgroup,
+ curr_group: matchgroup,
+ curr_group_inited: bool,
+ rep_counters: []size,
+ matched: bool,
+ failed: bool,
+};
+
+// Marker type without payload. NOTE(review): its use is outside this view.
+type newmatch = void;
+
+// The named character classes that may appear inside a bracket expression,
+// written as "[:name:]" (see charclass_names for the accepted spellings).
+export type charclass = enum {
+ ALNUM, ALPHA, ASCII, BLANK, CNTRL, DIGIT, GRAPH, LOWER, PRINT, PUNCT,
+ SPACE, UPPER, WORD, XDIGIT,
+};
+// The contents of one bracket expression: a list whose items are a literal
+// rune, a single-byte range given as (start byte, end byte), or a named
+// character class.
+export type charset = [](charset_lit_item | charset_range_item |
+ charset_class_item),
+ charset_lit_item = rune,
+ charset_range_item = (u8, u8),
+ charset_class_item = charclass;
+// Maps each character class to its spelling in an expression. The strings
+// omit the leading "[" and include the closing "]" because the bracket parser
+// compares them, as prefixes, with the text that follows the "[".
+const charclass_names: [](charclass, str) = [
+ (charclass::ALNUM, ":alnum:]"),
+ (charclass::ALPHA, ":alpha:]"),
+ (charclass::ASCII, ":ascii:]"),
+ (charclass::BLANK, ":blank:]"),
+ (charclass::CNTRL, ":cntrl:]"),
+ (charclass::DIGIT, ":digit:]"),
+ (charclass::GRAPH, ":graph:]"),
+ (charclass::LOWER, ":lower:]"),
+ (charclass::PRINT, ":print:]"),
+ (charclass::PUNCT, ":punct:]"),
+ (charclass::SPACE, ":space:]"),
+ (charclass::UPPER, ":upper:]"),
+ (charclass::WORD, ":word:]"),
+ (charclass::XDIGIT, ":xdigit:]"),
+];
+// Maps each character class to a predicate over runes. All predicates come
+// from the ascii module except WORD, which uses the local isword().
+// NOTE(review): the lookup that uses this table is outside this view.
+const charclass_fns: [](charclass, *fn(c: rune) bool) = [
+ (charclass::ALNUM, &ascii::isalnum),
+ (charclass::ALPHA, &ascii::isalpha),
+ (charclass::ASCII, &ascii::isascii),
+ (charclass::BLANK, &ascii::isblank),
+ (charclass::CNTRL, &ascii::iscntrl),
+ (charclass::DIGIT, &ascii::isdigit),
+ (charclass::GRAPH, &ascii::isgraph),
+ (charclass::LOWER, &ascii::islower),
+ (charclass::PRINT, &ascii::isprint),
+ (charclass::PUNCT, &ascii::ispunct),
+ (charclass::SPACE, &ascii::isspace),
+ (charclass::UPPER, &ascii::isupper),
+ (charclass::WORD, &isword),
+ (charclass::XDIGIT, &ascii::isxdigit),
+];
+// Returned by the bracket parser when an endpoint of a range such as "[a-z]"
+// encodes to more than one UTF-8 byte.
+const multibyte_err: error = "Character ranges do not support characters larger than one byte.";
+
+// A compiled regular expression. insts is the instruction list and charsets
+// holds the bracket expressions that inst_charset instructions refer to by
+// index; both are heap-allocated and released by regex_free().
+// NOTE(review): n_reps is presumably the number of repetition counters a
+// matcher thread needs - confirm against compile() and the matcher.
+export type regex = struct {
+ insts: []inst,
+ charsets: []charset,
+ n_reps: size,
+};
+
+// Releases everything owned by a compiled regex: each individual charset, the
+// slice holding the charsets, and the instruction list.
+export fn regex_free(re: regex) void = {
+	for (let remaining = len(re.charsets); remaining > 0; remaining -= 1) {
+		free(re.charsets[remaining - 1]);
+	};
+	free(re.charsets);
+	free(re.insts);
+};
+
+// Searches insts from the back and returns the index of the last
+// inst_groupstart. If there is none, an error describing an unmatched ")" is
+// returned.
+fn find_last_groupstart(insts: *[]inst) (size | error) = {
+	let idx = len(insts);
+	for (idx > 0) {
+		idx -= 1;
+		if (insts[idx] is inst_groupstart) {
+			return idx;
+		};
+	};
+	return `Encountered ")" token without matching "("`: error;
+};
+
+// Predicate for the [:word:] character class: true for an underscore and for
+// any rune accepted by ascii::isalnum.
+fn isword(c: rune) bool = {
+	return c == '_' || ascii::isalnum(c);
+};
+
+// Processes one rune of a bracket expression ("[...]") on behalf of compile().
+//
+// r is the current rune and iter is positioned directly after it. The
+// remaining pointer parameters are parser state owned by the caller:
+// - insts: instruction list; an inst_charset is appended when the bracket
+//   expression is closed.
+// - r_idx: index of the current rune; advanced by one per call and by two
+//   more when a range consumes its "-" and end rune.
+// - bracket_idx: position of r inside the bracket expression; -1 means a new
+//   expression starts here, in which case a fresh charset is appended to
+//   charsets. Reset to -1 when the expression is closed.
+// - skip_charclass_rest: set after a "[:name:]" class has been recognised so
+//   that the rest of its spelling, up to and including "]", is skipped.
+// - is_charset_positive: cleared by a leading "^", restored on close.
+// - in_bracket: cleared when the closing "]" is reached.
+//
+// Returns an error for an unknown "[:" class, for a range whose endpoints
+// are not single-byte, and for a range whose end precedes its start.
+fn handle_bracket(
+	insts: *[]inst,
+	r: rune,
+	r_idx: *size,
+	bracket_idx: *int,
+	iter: *strings::iterator,
+	charsets: *[]charset,
+	skip_charclass_rest: *bool,
+	is_charset_positive: *bool,
+	in_bracket: *bool
+) (void | error) = {
+	// Look up to three runes ahead, then rewind the iterator by exactly
+	// the number of runes that were read.
+	const peek1 = strings::next(iter);
+	const peek2 = strings::next(iter);
+	const peek3 = strings::next(iter);
+	if (!(peek1 is void)) {
+		strings::prev(iter);
+	};
+	if (!(peek2 is void)) {
+		strings::prev(iter);
+	};
+	if (!(peek3 is void)) {
+		strings::prev(iter);
+	};
+
+	if (*bracket_idx == -1) {
+		append(charsets, alloc([]));
+	};
+	*bracket_idx += 1;
+
+	if (*skip_charclass_rest) {
+		if (r == ']') {
+			*skip_charclass_rest = false;
+		};
+		*r_idx += 1;
+		return;
+	};
+
+	// "x-y" is only a range when y is not the closing bracket: a "-"
+	// directly before "]" is a literal hyphen. Without this check,
+	// "[b-]c" would be read as the range "b" to "]" and be rejected, and
+	// "[!-]x" would consume the closing bracket.
+	const is_range = peek1 is rune && peek1 as rune == '-' &&
+		!(peek2 is void) && !(peek3 is void) &&
+		!(peek2 as rune == ']');
+	const range_end = peek2;
+	// A "]" in first position (optionally after "^") is a literal.
+	const is_first_char = *bracket_idx == 0 ||
+		(*bracket_idx == 1 && !*is_charset_positive);
+	if (r == ']' && !is_first_char) {
+		const newinst = inst_charset {
+			idx = len(charsets) - 1,
+			is_positive = *is_charset_positive,
+		};
+		append(insts, newinst);
+		*in_bracket = false;
+		*bracket_idx = -1;
+		*is_charset_positive = true;
+	} else if (r == '^' && *bracket_idx == 0) {
+		*is_charset_positive = false;
+	} else if (r == '[' && !(peek1 is void) &&
+			peek1 as rune == ':') {
+		// The iterator is positioned after "[", so the remaining text
+		// starts with ":name:]" for a well-formed character class.
+		const rest = strings::iterstr(iter);
+		const n_cc = len(charclass_names);
+		for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) {
+			const cc = charclass_names[cc_idx];
+			if (strings::hasprefix(rest, cc.1)) {
+				append(charsets[len(charsets) - 1],
+					cc.0: charset_class_item);
+				*skip_charclass_rest = true;
+				break;
+			};
+		};
+		if (!*skip_charclass_rest) {
+			return `Found "[:" in bracket expression and expected a character class such as [:alpha:], but none was found. If you did not mean to use a charclass, try ":["`: error;
+		};
+	} else if (is_range) {
+		const start_enc = utf8::encoderune(r);
+		if (len(start_enc) > 1) {
+			return multibyte_err;
+		};
+		const start_b = start_enc[0];
+
+		const end_enc = utf8::encoderune(range_end as rune);
+		if (len(end_enc) > 1) {
+			return multibyte_err;
+		};
+		const end_b = end_enc[0];
+
+		if (end_b < start_b) {
+			return `Found range in bracket expression where end character was before start character, e.g. "[b-a]"`: error;
+		};
+
+		append(charsets[len(charsets) - 1],
+			(start_b, end_b): charset_range_item);
+		// Consume the "-" and the end rune of the range.
+		strings::next(iter);
+		strings::next(iter);
+		*r_idx += 2;
+	} else {
+		append(charsets[len(charsets) - 1],
+			r: charset_lit_item);
+	};
+
+	*r_idx += 1;
+};
+
+// Compiles a string containing a regular expression into a regex struct.
+//
+// The expression is read rune by rune and translated into a flat list of
+// instructions, terminated by an inst_match. Bracket expressions are delegated
+// to handle_bracket, and "{...}" intervals to parse_repetition. On success the
+// returned regex takes ownership of the instruction list and the charsets. On
+// failure an error describing the problem is returned and everything allocated
+// during compilation is freed.
+export fn compile(expr: str) (regex | error) = {
+	let insts: []inst = alloc([]);
+	let charsets: []charset = alloc([]);
+	// There are many early error returns below; release the partially
+	// built program on all of them. On success ownership moves to the
+	// returned regex, so the cleanup is disarmed just before returning.
+	let compiled = false;
+	defer {
+		if (!compiled) {
+			free(insts);
+			for (let i = 0z; i < len(charsets); i += 1) {
+				free(charsets[i]);
+			};
+			free(charsets);
+		};
+	};
+	let iter = strings::iter(expr);
+	// Index of the rune currently being processed
+	let r_idx = 0z;
+	let match_type = match_type::FLOATING;
+	// Index of the pending inst_jump emitted by "|", or -1 if there is none
+	let curr_alt_jump_idx = -1;
+	let in_bracket = false;
+	let skip_charclass_rest = false;
+	let bracket_idx = -1;
+	let is_charset_positive = true;
+	let n_reps = 0z;
+	let n_groupstarts = 0;
+
+	for (true) {
+		const next = strings::next(&iter);
+
+		// Unless the expression is anchored with "^", it starts with
+		// an inst_skip
+		if (r_idx == 0 && next is rune && next: rune != '^') {
+			append(insts, void: inst_skip);
+		};
+
+		if (in_bracket) {
+			if (next is void) {
+				return `Found unterminated bracket expression, are you missing a closing "]"?`: error;
+			};
+			const r = next: rune;
+			handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter,
+				&charsets, &skip_charclass_rest,
+				&is_charset_positive,
+				&in_bracket)?;
+			continue;
+		};
+
+		const r = match (next) {
+		case void =>
+			if (n_groupstarts > 0) {
+				return "Expression ended, but there were still unclosed groups": error;
+			};
+			break;
+		case let r: rune => yield r;
+		};
+		switch (r) {
+		case '\\' =>
+			const peek1 = strings::next(&iter);
+			if (peek1 is void) {
+				return "Found an escaping backslash, but there was nothing to escape": error;
+			} else {
+				append(insts, (peek1 as rune): inst_lit);
+				r_idx += 1;
+			};
+		case '^' =>
+			if (r_idx != 0) {
+				return `Anchor character "^" may only occur at the start of the expression`: error;
+			};
+		case '$' =>
+			// "$" must be the very last rune. Ask the iterator
+			// whether anything is left rather than comparing the
+			// rune index with len(expr): the latter is a byte
+			// count and rejects valid expressions that contain
+			// multibyte runes.
+			if (len(strings::iterstr(&iter)) != 0) {
+				return `Anchor character "$" may only occur at the end of the expression`: error;
+			};
+			match_type = match_type::ANCHORED;
+		case '[' =>
+			in_bracket = true;
+		case ']' =>
+			// Bracket expressions are consumed by handle_bracket
+			// above, so a "]" seen here is always a literal.
+			append(insts, r: inst_lit);
+		case '(' =>
+			append(insts, void: inst_groupstart);
+			n_groupstarts += 1;
+		case ')' =>
+			if (n_groupstarts == 0) {
+				return "Tried to close group but none was open": error;
+			};
+			n_groupstarts -= 1;
+			append(insts, void: inst_groupend);
+			// Patch the jump left behind by a preceding "|" so
+			// that it lands on this groupend
+			if (curr_alt_jump_idx != -1) {
+				assert(insts[curr_alt_jump_idx] is inst_jump);
+				insts[curr_alt_jump_idx] =
+					(len(insts) - 1): inst_jump;
+				curr_alt_jump_idx = -1;
+			};
+		case '|' =>
+			// Placeholder target, patched when the group closes
+			append(insts, 9999999: inst_jump);
+			const origin = find_last_groupstart(&insts)? + 1;
+			const newinst = (len(insts) + 1): inst_split;
+			insert(insts[origin], newinst);
+			curr_alt_jump_idx = (len(insts) - 1): int;
+		case '{' =>
+			// Without this guard, len(insts) - 1 underflows and the
+			// index below is out of bounds (e.g. for "^{3}").
+			if (len(insts) == 0) {
+				return `Found "{" but there was nothing before it`: error;
+			};
+			let origin = len(insts) - 1;
+			if (insts[origin] is inst_groupend) {
+				origin = find_last_groupstart(&insts)?;
+			};
+			const rest = strings::iterstr(&iter);
+			const rep_parts = parse_repetition(rest)?;
+			const can_skip = rep_parts.0 == 0;
+			const min = if (rep_parts.0 == 0) {
+				yield 1z;
+			} else {
+				yield rep_parts.0;
+			};
+			if (can_skip) {
+				insert(insts[origin],
+					len(insts) + 2: inst_split);
+				origin += 1;
+			};
+			const newinst = inst_repeat {
+				id = n_reps,
+				origin = origin,
+				min = min,
+				max = rep_parts.1,
+			};
+			// Consume the interval body and the closing "}"
+			for (let i = 0z; i <= rep_parts.2; i += 1) {
+				strings::next(&iter);
+				r_idx += 1;
+			};
+			append(insts, newinst);
+			n_reps += 1;
+		case '?' =>
+			if (r_idx == 0 || len(insts) == 0) {
+				return `Found "?" but there was nothing before it`: error;
+			};
+			let term_start_idx = len(insts) - 1;
+			match (insts[term_start_idx]) {
+			case (inst_lit | inst_charset | inst_any) => void;
+			case inst_groupend =>
+				term_start_idx = find_last_groupstart(&insts)?;
+			case inst_groupstart =>
+				return `Found "?" but it was in an empty group`: error;
+			case =>
+				return `Invalid use of "?"`: error;
+			};
+			const after_idx = len(insts) + 1;
+			insert(insts[term_start_idx], after_idx: inst_split);
+		case '*' =>
+			if (r_idx == 0 || len(insts) == 0) {
+				return `Found "*" but there was nothing before it`: error;
+			};
+			const new_inst_offset = 1z;
+			const jump_idx = len(insts) + new_inst_offset;
+			const after_idx = jump_idx + 1z;
+			let term_start_idx = len(insts) - 1z;
+			match (insts[term_start_idx]) {
+			case (inst_lit | inst_charset | inst_any) => void;
+			case inst_groupend =>
+				term_start_idx = find_last_groupstart(&insts)?;
+			case inst_groupstart =>
+				return `Found "*" but it was in an empty group`: error;
+			case =>
+				return `Invalid use of "*"`: error;
+			};
+			const split_idx = term_start_idx;
+			term_start_idx += new_inst_offset;
+			insert(insts[split_idx], after_idx: inst_split);
+			append(insts, split_idx: inst_jump);
+		case '+' =>
+			if (r_idx == 0 || len(insts) == 0) {
+				return `Found "+" but there was nothing before it`: error;
+			};
+			let term_start_idx = len(insts) - 1;
+			match (insts[term_start_idx]) {
+			case (inst_lit | inst_charset | inst_any) => void;
+			case inst_groupend =>
+				term_start_idx = find_last_groupstart(&insts)?;
+			case inst_groupstart =>
+				return `Found "+" but it was in an empty group`: error;
+			case =>
+				return `Invalid use of "+"`: error;
+			};
+			append(insts, term_start_idx: inst_split);
+		case '.' =>
+			append(insts, void: inst_any);
+		case =>
+			append(insts, r: inst_lit);
+		};
+		r_idx += 1;
+	};
+
+	append(insts, match_type: inst_match);
+
+	compiled = true;
+	return regex {
+		insts = insts,
+		charsets = charsets,
+		n_reps = n_reps,
+	};
+};
+
+// Parses the body of an interval expression such as "3}", "2,5}", "2,}" or
+// ",5}". The string s is expected to start right after the opening "{" and may
+// continue past the closing "}".
+//
+// Returns a tuple of (minimum, maximum, length), where a bound is void if it
+// was left empty and length is the number of bytes between the braces. For the
+// single-value form "{n}" both bounds are n. An error is returned if there is
+// no closing "}", if a bound is not a non-negative integer, or if the minimum
+// is greater than the maximum.
+fn parse_repetition(
+	s: str
+) (((void | size), (void | size), size) | error) = {
+	const first_comma = strings::index(s, ",");
+	const first_endbrace = strings::index(s, "}");
+	if (first_endbrace is void) {
+		return "Invalid repetition value": error;
+	};
+	const first_endbrace = first_endbrace as size;
+
+	let min_str = "";
+	let max_str = "";
+	let is_single_arg = false;
+	// A comma only belongs to this interval if it comes before the "}"
+	if (first_comma is void || first_endbrace < first_comma as size) {
+		const cut = strings::cut(s, "}");
+		min_str = cut.0;
+		max_str = cut.0;
+		is_single_arg = true;
+	} else {
+		const cut = strings::cut(s, ",");
+		min_str = cut.0;
+		max_str = strings::cut(cut.1, "}").0;
+	};
+
+	let min: (void | size) = void;
+	let max: (void | size) = void;
+
+	if (len(min_str) > 0) {
+		min = match (strconv::stoi(min_str)) {
+		case let res: int =>
+			yield if (res < 0) {
+				return `Only positive integers are allowed inside "{}"`: error;
+			} else {
+				yield res: size;
+			};
+		case => return "Invalid repetition minimum value": error;
+		};
+	};
+
+	if (len(max_str) > 0) {
+		max = match (strconv::stoi(max_str)) {
+		case let res: int =>
+			yield if (res < 0) {
+				return `Only positive integers are allowed inside "{}"`: error;
+			} else {
+				yield res: size;
+			};
+		case => return "Invalid repetition maximum value": error;
+		};
+	};
+
+	// An interval such as "{3,2}" can never be satisfied; reject it at
+	// compile time instead of producing a program that silently never
+	// matches.
+	if (min is size && max is size && (min as size) > (max as size)) {
+		return `Found repetition whose minimum was greater than its maximum, e.g. "{3,2}"`: error;
+	};
+
+	const rep_len = if (is_single_arg) {
+		yield len(min_str);
+	} else {
+		// Both bounds plus the comma between them
+		yield len(min_str) + 1 + len(max_str);
+	};
+	return (min, max, rep_len);
+};
+
+// Frees the capture groups and repetition counters owned by the thread at
+// index i, then removes that thread from the list.
+fn delete_thread(i: size, threads: *[]thread) void = {
+	const doomed = threads[i];
+	free(doomed.rep_counters);
+	free(doomed.groups);
+	delete(threads[i]);
+};
+
+// Reports whether an instruction is a literal, an "any" or a charset
+// instruction. run_thread tests these against an input rune.
+fn is_consuming_inst(a: inst) bool = {
+	return match (a) {
+	case (inst_lit | inst_any | inst_charset) =>
+		yield true;
+	case =>
+		yield false;
+	};
+};
+
+// Forks the thread at parent_idx into a new thread that resumes at new_pc. The
+// new thread inherits the parent's state and receives its own copies of the
+// parent's capture groups and repetition counters.
+//
+// Nothing is added if an unmatched thread with the same PC and an earlier
+// start_idx than the parent already exists.
+fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = {
+	const parent = threads[parent_idx];
+
+	for (let k = 0z; k < len(threads); k += 1) {
+		const other = threads[k];
+		if (other.matched || other.pc != new_pc) {
+			continue;
+		};
+		if (other.start_idx < parent.start_idx) {
+			return;
+		};
+	};
+
+	append(threads, thread {
+		pc = new_pc,
+		start_idx = parent.start_idx,
+		curr_group = parent.curr_group,
+		curr_group_inited = parent.curr_group_inited,
+		matched = parent.matched,
+		failed = parent.failed,
+		groups = alloc(parent.groups...),
+		rep_counters = alloc(parent.rep_counters...),
+		...
+	});
+};
+
+// Advances the thread at index i by one step of the input.
+//
+// First, all non-consuming instructions at the thread's PC (split, jump, skip,
+// match, group start/end, repeat) are executed; these may fork new threads via
+// add_thread. Then the consuming instruction the thread has arrived at is
+// tested against r_or_end, which is the current input rune, or void at the end
+// of the input. str_idx is the index of that rune and is used for match and
+// group boundaries.
+//
+// Returns newmatch when the thread reaches inst_match, an error for
+// unsupported input or expressions, and void otherwise. A thread that cannot
+// continue is flagged as failed rather than removed.
+fn run_thread(
+ i: size,
+ re: regex,
+ string: str,
+ threads: *[]thread,
+ r_or_end: (rune | void),
+ str_idx: int
+) (void | error | newmatch) = {
+ // Threads that have already matched are left untouched
+ if (threads[i].matched) {
+ return;
+ };
+ for (!is_consuming_inst(re.insts[threads[i].pc])) {
+ match (re.insts[threads[i].pc]) {
+ // Consuming instructions never get here due to the loop condition
+ case inst_lit => abort();
+ case inst_any => abort();
+ case inst_split =>
+ // Fork: the new thread takes the split target, this one falls
+ // through to the next instruction
+ const new_pc = re.insts[threads[i].pc]: inst_split: size;
+ add_thread(threads, i, new_pc);
+ threads[i].pc += 1;
+ case inst_jump =>
+ threads[i].pc = re.insts[threads[i].pc]: inst_jump: size;
+ case inst_skip =>
+ // This thread stays on the skip instruction and forks a thread
+ // that attempts a match starting at the current rune
+ const new_pc = threads[i].pc + 1;
+ threads[i].start_idx = str_idx: size;
+ add_thread(threads, i, new_pc);
+ break;
+ case inst_match =>
+ // Do not match if we need an end-anchored match, but we
+ // have not exhausted our string
+ const mt = re.insts[threads[i].pc]: inst_match: match_type;
+ if (mt == match_type::ANCHORED && !(r_or_end is void)) {
+ threads[i].failed = true;
+ return;
+ };
+ threads[i].root_group = matchgroup {
+ start = threads[i].start_idx,
+ end = str_idx: size,
+ // TODO: This is a perf issue for large strings
+ content = strings::sub(string,
+ threads[i].start_idx,
+ str_idx: size),
+ };
+ threads[i].matched = true;
+ return newmatch;
+ case inst_groupstart =>
+ if (threads[i].curr_group_inited) {
+ return "Found nested capture groups in expression, which are not supported": error;
+ };
+ threads[i].curr_group.start = str_idx: size;
+ threads[i].curr_group_inited = true;
+ threads[i].pc += 1;
+ case inst_groupend =>
+ if (!threads[i].curr_group_inited) {
+ return `Found a groupend token ")" without having previously seen a groupstart token "("`: error;
+ };
+ threads[i].curr_group.end = str_idx: size;
+ // TODO: This is a perf issue for large strings
+ threads[i].curr_group.content = strings::sub(string,
+ threads[i].curr_group.start,
+ threads[i].curr_group.end);
+ // Store the finished group and reset the working group
+ append(threads[i].groups, threads[i].curr_group);
+ threads[i].curr_group = matchgroup { ... };
+ threads[i].curr_group_inited = false;
+ threads[i].pc += 1;
+ case let ir: inst_repeat =>
+ // Count this pass and fail once the maximum is exceeded
+ assert(ir.id < len(threads[i].rep_counters));
+ threads[i].rep_counters[ir.id] += 1;
+ if (ir.max is size &&
+ threads[i].rep_counters[ir.id] >
+ ir.max as size) {
+ threads[i].failed = true;
+ return;
+ };
+ // This thread goes back for another repetition; once the minimum
+ // has been reached, a forked thread continues past the repeat
+ const new_pc = threads[i].pc + 1;
+ threads[i].pc = ir.origin;
+ if (ir.min is void ||
+ threads[i].rep_counters[ir.id] >=
+ ir.min as size) {
+ add_thread(threads, i, new_pc);
+ };
+ };
+ };
+
+ // From now on, we're only matching consuming instructions, and these
+ // can't do anything without another rune.
+ if (r_or_end is void) {
+ threads[i].failed = true;
+ return;
+ };
+
+ const r = r_or_end as rune;
+
+ match (re.insts[threads[i].pc]) {
+ // The loop above breaks out on inst_skip; return without advancing
+ // the PC so that the thread stays on the skip instruction
+ case inst_skip => return;
+ case let lit: inst_lit =>
+ if (r != lit) {
+ threads[i].failed = true;
+ };
+ case inst_any => void;
+ case let cs: inst_charset =>
+ const charset = re.charsets[cs.idx];
+ // Disprove the match if we're looking for a negative match
+ // Prove the match if we're looking for a positive match
+ let matched = !cs.is_positive;
+ // Note that this loop variable shadows the thread index i within
+ // the loop
+ for (let i = 0z; i < len(charset); i += 1) match (charset[i]) {
+ case let lit: charset_lit_item =>
+ if (r == lit) {
+ // Succeeded if positive match
+ // Failed if negative match
+ matched = cs.is_positive;
+ break;
+ };
+ case let range: charset_range_item =>
+ // Ranges are compared bytewise, so only single-byte runes are
+ // supported
+ const r_enc = utf8::encoderune(r);
+ if (len(r_enc) > 1) {
+ return multibyte_err;
+ };
+ const r_b = r_enc[0];
+ if (r_b >= range.0 && r_b <= range.1) {
+ // Succeeded if positive match
+ // Failed if negative match
+ matched = cs.is_positive;
+ break;
+ };
+ case let class: charset_class_item =>
+ const n_cc = len(charclass_fns);
+ for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) {
+ const cc = charclass_fns[cc_idx];
+ if (cc.0 == class: charclass && cc.1(r)) {
+ // Succeeded if positive match
+ // Failed if negative match
+ matched = cs.is_positive;
+ // NOTE(review): this appears to leave only the
+ // charclass_fns loop, so the remaining charset items
+ // are still visited; harmless, since matched is only
+ // ever set to the same value
+ break;
+ };
+ };
+ };
+ if (!matched) {
+ threads[i].failed = true;
+ };
+ };
+
+ threads[i].pc += 1;
+};
+
+// Attempts to match a regular expression against the remainder of a string,
+// starting at the current position of str_iter, and returns the best match
+// found, or void if there is none.
+//
+// str_iter and str_idx are advanced by one rune per step. str_idx is
+// incremented before each rune is processed, which is why find and findall
+// start it at -1. The result is a newly allocated slice containing the match
+// as a whole followed by its capture groups. Among the matched threads the
+// longest match is preferred, and among equally long ones the one with the
+// most capture groups.
+fn search(
+ re: regex,
+ string: str,
+ str_iter: *strings::iterator,
+ str_idx: *int
+) (void | []matchgroup | error) = {
+ // Start with a single thread at the first instruction
+ let threads: []thread = alloc([
+ thread { groups = alloc([]), ... }
+ ]);
+ if (re.n_reps > 0) {
+ threads[0].rep_counters = alloc([0...], re.n_reps);
+ };
+ // Release all remaining threads regardless of how we return
+ defer {
+ for (let i = 0z; i < len(threads); i += 1) {
+ free(threads[i].groups);
+ free(threads[i].rep_counters);
+ };
+ free(threads);
+ };
+
+ // start_idx of the earliest non-zero-length match found so far
+ let first_match_idx: (void | size) = void;
+
+ for (true) {
+ // Every thread has failed, so there is no match
+ if (len(threads) == 0) {
+ return void;
+ };
+
+ let all_matched = true;
+ for (let i = 0z; i < len(threads); i += 1) {
+ if (!threads[i].matched) {
+ all_matched = false;
+ break;
+ };
+ };
+
+ // Once only matched threads remain, pick the best one
+ if (all_matched) {
+ let best_len = 0z;
+ let best_n_groups = 0z;
+ let best_idx = 0z;
+ for (let i = 0z; i < len(threads); i += 1) {
+ let match_len = threads[i].root_group.end -
+ threads[i].root_group.start;
+ const is_better = match_len > best_len ||
+ match_len == best_len &&
+ len(threads[i].groups) > best_n_groups;
+ if (is_better) {
+ best_len = match_len;
+ best_idx = i;
+ best_n_groups = len(threads[i].groups);
+ };
+ };
+ // The result is a separate allocation, since the threads
+ // themselves are freed by the deferred cleanup
+ let res: []matchgroup = alloc([],
+ len(threads[best_idx].groups) + 1);
+ append(res, threads[best_idx].root_group);
+ append(res, threads[best_idx].groups...);
+ return res;
+ };
+
+ const r_or_end = strings::next(str_iter);
+ *str_idx += 1;
+
+ // Step every thread, including threads that are added to the list
+ // while this loop is running
+ for (let i = 0z; i < len(threads); i += 1) {
+ const res = run_thread(i, re, string, &threads,
+ r_or_end, *str_idx)?;
+ const matchlen = threads[i].root_group.end -
+ threads[i].root_group.start;
+ const is_better = res is newmatch && matchlen > 0 &&
+ (first_match_idx is void ||
+ threads[i].start_idx <
+ first_match_idx as size);
+ if (is_better) {
+ first_match_idx = threads[i].start_idx;
+ };
+ };
+
+ // When we only want the leftmost match, delete all threads that
+ // start after the earliest non-zero-length matched thread
+ if (first_match_idx is size) {
+ for (let i = 0z; i < len(threads); i += 1) {
+ if (threads[i].start_idx >
+ first_match_idx as size) {
+ threads[i].failed = true;
+ };
+ };
+ };
+
+ // Delete threads that have a PC that has already been
+ // encountered in previous threads. Prioritise threads that
+ // have an earlier start_idx, and threads that were added
+ // earlier.
+ // The indices are signed here so that they can be stepped back
+ // after a deletion, even at index 0.
+ for (let i = 0i64; i < len(threads): i64 - 1; i += 1) {
+ for (let j = i + 1; j < len(threads): i64; j += 1) {
+ const same_pc = threads[i].pc == threads[j].pc;
+ const none_matched = !threads[j].matched &&
+ !threads[i].matched;
+ if (same_pc && none_matched) {
+ if (threads[i].start_idx <=
+ threads[j].start_idx) {
+ delete_thread(j: size, &threads);
+ j -= 1;
+ } else {
+ delete_thread(i: size, &threads);
+ i -= 1;
+ break;
+ };
+ };
+ };
+ };
+
+ // Remove failed threads.
+ // NOTE(review): i is a size here, so stepping back after deleting
+ // index 0 presumably relies on unsigned wraparound being undone by
+ // the loop's i += 1; confirm
+ for (let i = 0z; i < len(threads); i += 1) {
+ if (threads[i].failed) {
+ delete_thread(i, &threads);
+ i -= 1;
+ };
+ };
+ };
+
+ // NOTE(review): this looks unreachable, since nothing breaks out of the
+ // outer loop above
+ return void;
+};
+
+
+// Attempts to match a regular expression against a string and returns the
+// longest leftmost match, or void if there is none. The first item of the
+// result is the match as a whole and the remaining items are its capture
+// groups. The result is allocated and must be freed by the caller.
+export fn find(re: regex, string: str) (void | []matchgroup | error) = {
+	// search increments the index before it reads each rune
+	let pos = -1;
+	let it = strings::iter(string);
+	const outcome = search(re, string, &it, &pos);
+	return outcome;
+};
+
+// Attempts to match a regular expression against a string and returns all
+// non-overlapping matches, or void if there are none. Each item of the result
+// has the same layout as the result of [[find]]. The result must be freed with
+// [[freeall]].
+export fn findall(re: regex, string: str) (void | [][]matchgroup | error) = {
+	let res: [][]matchgroup = alloc([]);
+	let str_idx = -1;
+	let str_iter = strings::iter(string);
+	for (true) {
+		match (search(re, string, &str_iter, &str_idx)) {
+		case let e: error =>
+			// Do not leak the matches collected so far
+			freeall(res);
+			return e;
+		case void => break;
+		case let m: []matchgroup =>
+			append(res, m);
+			// search may have read past the end of the match;
+			// rewind so that the next search starts right after it
+			assert(str_idx: size >= m[0].end);
+			for (str_idx: size > m[0].end) {
+				strings::prev(&str_iter);
+				str_idx -= 1;
+			};
+			if (str_idx: size >= len(string)) {
+				break;
+			};
+		};
+	};
+	if (len(res) == 0) {
+		free(res);
+		return void;
+	};
+	return res;
+};
+
+// Frees every match in a slice of matches, followed by the slice itself.
+export fn freeall(s: [][]matchgroup) void = {
+	let k = 0z;
+	for (k < len(s)) {
+		free(s[k]);
+		k += 1;
+	};
+	free(s);
+};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -1025,6 +1025,16 @@ path() {
gen_ssa path strings bufio bytes io
}
+# Describes the regex module: its source files and the modules it depends on.
+# When $testing is non-zero, +test.ha is added to the sources and fmt, io and
+# os are added to the dependencies.
+regex() {
+ if [ $testing -eq 0 ]; then
+ gen_srcs regex regex.ha
+ gen_ssa regex encoding::utf8 errors strconv strings
+ else
+ gen_srcs regex regex.ha +test.ha
+ gen_ssa regex encoding::utf8 errors strconv strings fmt io os
+ fi
+}
+
gensrcs_strconv() {
gen_srcs strconv \
types.ha \
@@ -1298,6 +1308,7 @@ net::uri
os linux freebsd
os::exec linux freebsd
path
+regex
shlex
slices
sort
diff --git a/stdlib.mk b/stdlib.mk
@@ -548,6 +548,12 @@ stdlib_deps_any+=$(stdlib_path_any)
stdlib_path_linux=$(stdlib_path_any)
stdlib_path_freebsd=$(stdlib_path_any)
+# gen_lib regex (any)
+# Object file of the regex module. The linux and freebsd variables both refer
+# to this same object.
+stdlib_regex_any=$(HARECACHE)/regex/regex-any.o
+stdlib_deps_any+=$(stdlib_regex_any)
+stdlib_regex_linux=$(stdlib_regex_any)
+stdlib_regex_freebsd=$(stdlib_regex_any)
+
# gen_lib shlex (any)
stdlib_shlex_any=$(HARECACHE)/shlex/shlex-any.o
stdlib_deps_any+=$(stdlib_shlex_any)
@@ -1612,6 +1618,16 @@ $(HARECACHE)/path/path-any.ssa: $(stdlib_path_any_srcs) $(stdlib_rt) $(stdlib_st
@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Npath \
-t$(HARECACHE)/path/path.td $(stdlib_path_any_srcs)
+# regex (+any)
+# Sources of the regex module, and the rule which runs $(HAREC) on them to
+# produce regex-any.ssa. The prerequisites are the sources, the runtime and the
+# encoding::utf8, errors, strconv and strings modules.
+stdlib_regex_any_srcs= \
+ $(STDLIB)/regex/regex.ha
+
+$(HARECACHE)/regex/regex-any.ssa: $(stdlib_regex_any_srcs) $(stdlib_rt) $(stdlib_encoding_utf8_$(PLATFORM)) $(stdlib_errors_$(PLATFORM)) $(stdlib_strconv_$(PLATFORM)) $(stdlib_strings_$(PLATFORM))
+ @printf 'HAREC \t$@\n'
+ @mkdir -p $(HARECACHE)/regex
+ @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nregex \
+ -t$(HARECACHE)/regex/regex.td $(stdlib_regex_any_srcs)
+
# shlex (+any)
stdlib_shlex_any_srcs= \
$(STDLIB)/shlex/split.ha
@@ -2429,6 +2445,12 @@ testlib_deps_any+=$(testlib_path_any)
testlib_path_linux=$(testlib_path_any)
testlib_path_freebsd=$(testlib_path_any)
+# gen_lib regex (any)
+# Object file of the regex module in the test build. The linux and freebsd
+# variables both refer to this same object.
+testlib_regex_any=$(TESTCACHE)/regex/regex-any.o
+testlib_deps_any+=$(testlib_regex_any)
+testlib_regex_linux=$(testlib_regex_any)
+testlib_regex_freebsd=$(testlib_regex_any)
+
# gen_lib shlex (any)
testlib_shlex_any=$(TESTCACHE)/shlex/shlex-any.o
testlib_deps_any+=$(testlib_shlex_any)
@@ -3533,6 +3555,17 @@ $(TESTCACHE)/path/path-any.ssa: $(testlib_path_any_srcs) $(testlib_rt) $(testlib
@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Npath \
-t$(TESTCACHE)/path/path.td $(testlib_path_any_srcs)
+# regex (+any)
+# Sources of the regex module in the test build, which include +test.ha, and
+# the rule which runs $(HAREC) on them with $(TESTHAREFLAGS) to produce
+# regex-any.ssa. Compared to the non-test rule, the fmt, io and os modules are
+# additional prerequisites.
+testlib_regex_any_srcs= \
+ $(STDLIB)/regex/regex.ha \
+ $(STDLIB)/regex/+test.ha
+
+$(TESTCACHE)/regex/regex-any.ssa: $(testlib_regex_any_srcs) $(testlib_rt) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_errors_$(PLATFORM)) $(testlib_strconv_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_fmt_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_os_$(PLATFORM))
+ @printf 'HAREC \t$@\n'
+ @mkdir -p $(TESTCACHE)/regex
+ @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nregex \
+ -t$(TESTCACHE)/regex/regex.td $(testlib_regex_any_srcs)
+
# shlex (+any)
testlib_shlex_any_srcs= \
$(STDLIB)/shlex/split.ha \