hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 79e76b127206608c7c55f6b791bcd9844c9bdcdd
parent b3d613259690b701f50e2a03384d59304bc16bd7
Author: Vlad-Stefan Harbuz <vlad@vladh.net>
Date:   Fri,  8 Apr 2022 14:03:42 +0200

add regex

Signed-off-by: Vlad-Stefan Harbuz <vlad@vladh.net>

Diffstat:
Aregex/+test.ha | 598+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Aregex/README | 41+++++++++++++++++++++++++++++++++++++++++
Aregex/regex.ha | 813+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mscripts/gen-stdlib | 11+++++++++++
Mstdlib.mk | 33+++++++++++++++++++++++++++++++++
5 files changed, 1496 insertions(+), 0 deletions(-)

diff --git a/regex/+test.ha b/regex/+test.ha @@ -0,0 +1,598 @@ +// License: MPL-2.0 +// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net> +use fmt; +use io; +use os; +use strings; + +def PERFTEST_ON: bool = false; +type matchres = enum { MATCH, NOMATCH, ERROR }; + +fn run_find_case( + expr: str, + string: str, + expected: matchres, + start: int, + end: int +) void = { + const re = match (compile(expr)) { + case let re: regex => yield re; + case let e: error => + if (expected == matchres::MATCH) { + fmt::println(e)!; + fmt::fatal("Expected expression /{}/ to match, but it errored", + expr, string); + }; + if (expected == matchres::NOMATCH) { + fmt::println(e)!; + fmt::fatal("Expected expression /{}/ to not match, but it errored", + expr, string); + }; + return; + }; + + match (find(re, string)) { + case void => + if (expected == matchres::MATCH) { + fmt::fatal("Expected expression /{}/ to match string \"{}\", but it did not", + expr, string); + }; + if (expected == matchres::ERROR) { + fmt::fatal("Expression /{}/ failed to match, but should have errored", + expr, string); + }; + + case let m: []matchgroup => + if (expected == matchres::NOMATCH) { + fmt::fatal("Expected expression /{}/ to not match string \"{}\", but it did", + expr, string); + }; + if (expected == matchres::ERROR) { + fmt::fatal("Expression /{}/ matched, but should have errored", + expr, string); + }; + if (start: size != m[0].start) { + fmt::fatal("Expected start of main match group to be {} but it was {}", + start, m[0].start); + }; + if (end: size != m[0].end) { + fmt::fatal("Expected end of main match group to be {} but it was {}", + end, m[0].end); + }; + + case let e: error => + if (expected == matchres::MATCH) { + fmt::fatal("Expected expression /{}/ to match, but it errored", + expr, string); + }; + if (expected == matchres::NOMATCH) { + fmt::fatal("Expected expression /{}/ to not match, but it errored", + expr, string); + }; + }; +}; + +fn run_findall_case( + expr: str, + string: str, + expected: matchres, + count: int +) void = { + const re = match (compile(expr)) { + case let re: regex => yield re; + case let e: error => + if (expected == matchres::MATCH) { + fmt::println(e)!; + fmt::fatal("Expected expression /{}/ to match, but it errored", + expr, string); + }; + if (expected == matchres::NOMATCH) { + fmt::println(e)!; + fmt::fatal("Expected expression /{}/ to not match, but it errored", + expr, string); + }; + return; + }; + + match (findall(re, string)) { + case void => + if (expected == matchres::MATCH) { + fmt::fatal("Expected expression /{}/ to match string \"{}\", but it did not", + expr, string); + }; + if (expected == matchres::ERROR) { + fmt::fatal("Expression /{}/ failed to match, but should have errored", + expr, string); + }; + + case let groupsets: [][]matchgroup => + if (expected == matchres::NOMATCH) { + fmt::fatal("Expected expression /{}/ to not match string \"{}\", but it did", + expr, string); + }; + if (expected == matchres::ERROR) { + fmt::fatal("Expression /{}/ matched, but should have errored", + expr, string); + }; + if (count: size != len(groupsets)) { + fmt::fatal("Expected to find {} matches but found {}", + count, len(groupsets)); + }; + + case let e: error => + if (expected == matchres::MATCH) { + fmt::fatal("Expected expression /{}/ to match, but it errored", + expr, string); + }; + if (expected == matchres::NOMATCH) { + fmt::fatal("Expected expression /{}/ to not match, but it errored", + expr, string); + }; + }; +}; + +@test fn find() void = { + const cases = [ + // literals + (`^$`, "", matchres::MATCH, 0, 0), + (``, "", matchres::MATCH, 0, -1), + (`abcd`, "abcd", matchres::MATCH, 0, -1), + (`abc`, "abcd", matchres::MATCH, 0, 3), + (`bcd`, "abcd", matchres::MATCH, 1, 4), + (`^abc$`, "abc", matchres::MATCH, 0, -1), + (`^abc$`, "axc", matchres::NOMATCH, 0, -1), + // . + (`^.$`, "x", matchres::MATCH, 0, 1), + (`^.$`, "y", matchres::MATCH, 0, 1), + (`^.$`, "", matchres::NOMATCH, 0, 1), + // + + (`^a+$`, "a", matchres::MATCH, 0, 1), + (`^a+$`, "aaa", matchres::MATCH, 0, 3), + (`^a+$`, "", matchres::NOMATCH, 0, 0), + (`^(abc)+$`, "abc", matchres::MATCH, 0, 3), + (`^(abc)+$`, "abcabc", matchres::MATCH, 0, 6), + (`^(abc)+$`, "", matchres::NOMATCH, 0, 0), + // * + (`^a*$`, "", matchres::MATCH, 0, 0), + (`^a*$`, "aaaa", matchres::MATCH, 0, 4), + (`^a*$`, "b", matchres::NOMATCH, 0, 0), + (`^(abc)*$`, "", matchres::MATCH, 0, 0), + (`^(abc)*$`, "abc", matchres::MATCH, 0, 3), + (`^(abc)*$`, "abcabc", matchres::MATCH, 0, 6), + (`^(abc)*$`, "bbb", matchres::NOMATCH, 0, 3), + // ? + (`^a?$`, "", matchres::MATCH, 0, 0), + (`^a?$`, "a", matchres::MATCH, 0, 1), + (`^a?$`, "b", matchres::NOMATCH, 0, 0), + (`^(abc)?$`, "", matchres::MATCH, 0, 0), + (`^(abc)?$`, "abc", matchres::MATCH, 0, 3), + (`^(abc)?$`, "bbb", matchres::NOMATCH, 0, 0), + // ^ and $ + (`^a*`, "aaaa", matchres::MATCH, 0, 4), + (`a*$`, "aaaa", matchres::MATCH, 0, 4), + (`^a*$`, "aaaa", matchres::MATCH, 0, 4), + (`a*`, "aaaa", matchres::MATCH, 0, 4), + (`b*`, "aaaabbbb", matchres::MATCH, 4, 8), + (`^b*`, "aaaabbbb", matchres::MATCH, 0, 0), + (`b*$`, "aaaabbbb", matchres::MATCH, 4, 8), + // (a|b) + (`^(cafe|b)x$`, "cafex", matchres::MATCH, 0, 5), + (`^(cafe|b)x$`, "bx", matchres::MATCH, 0, 2), + (`^(cafe|b)x$`, "XXXx", matchres::NOMATCH, 0, 0), + (`^(cafe|b)x$`, "bx", matchres::MATCH, 0, 2), + ( + `^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`, + "Jagdhaftpflichtversicherungsbetrag", + matchres::MATCH, 0, -1 + ), + ( + `^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`, + "Jagdhaftpflichtversicherungsbetrug", + matchres::NOMATCH, 0, -1 + ), + ( + `^(Privat|Jagd)(haftpflicht|schaden)versicherungs(police|betrag)$`, + "Jagdversicherungspolice", + matchres::NOMATCH, 0, -1 + ), + (`)`, "", matchres::ERROR, 0, 0), + // [abc] + (`^test[abc]$`, "testa", matchres::MATCH, 0, -1), + (`^test[abc]$`, "testb", matchres::MATCH, 0, -1), + (`^test[abc]$`, "testc", matchres::MATCH, 0, -1), + (`^test[abc]$`, "testd", matchres::NOMATCH, 0, -1), + (`^test[abc]*$`, "test", matchres::MATCH, 0, -1), + (`^test[abc]*$`, "testa", matchres::MATCH, 0, -1), + (`^test[abc]*$`, "testaaa", matchres::MATCH, 0, -1), + (`^test[abc]*$`, "testabc", matchres::MATCH, 0, -1), + (`^test[abc]?$`, "test", matchres::MATCH, 0, -1), + (`^test[abc]?$`, "testa", matchres::MATCH, 0, -1), + (`^test[abc]+$`, "testa", matchres::MATCH, 0, -1), + (`^test[abc]+$`, "test", matchres::NOMATCH, 0, -1), + (`^test[]abc]$`, "test]", matchres::MATCH, 0, -1), + (`^test[[abc]$`, "test[", matchres::MATCH, 0, -1), + (`^test[^abc]$`, "testd", matchres::MATCH, 0, -1), + (`^test[^abc]$`, "test!", matchres::MATCH, 0, -1), + (`^test[^abc]$`, "testa", matchres::NOMATCH, 0, -1), + (`^test[^abc]$`, "testb", matchres::NOMATCH, 0, -1), + (`^test[^abc]$`, "testc", matchres::NOMATCH, 0, -1), + (`^test[^]abc]$`, "test]", matchres::NOMATCH, 0, -1), + (`^test[^abc[]$`, "test[", matchres::NOMATCH, 0, -1), + (`^test[^abc]*$`, "testd", matchres::MATCH, 0, -1), + (`^test[^abc]*$`, "testqqqqq", matchres::MATCH, 0, -1), + (`^test[^abc]*$`, "test", matchres::MATCH, 0, -1), + (`^test[^abc]*$`, "testc", matchres::NOMATCH, 0, -1), + (`^test[^abc]?$`, "test", matchres::MATCH, 0, -1), + (`^test[^abc]?$`, "testd", matchres::MATCH, 0, -1), + (`^test[^abc]?$`, "testc", matchres::NOMATCH, 0, -1), + (`^test[^abc]+$`, "testd", matchres::MATCH, 0, -1), + (`^test[^abc]+$`, "testddd", matchres::MATCH, 0, -1), + (`^test[^abc]+$`, "testc", matchres::NOMATCH, 0, -1), + (`^test[^abc]+$`, "testcccc", matchres::NOMATCH, 0, -1), + (`^test[a-c]$`, "testa", matchres::MATCH, 0, -1), + (`^test[a-c]$`, "testb", matchres::MATCH, 0, -1), + (`^test[a-c]$`, "testc", matchres::MATCH, 0, -1), + (`^test[a-c]$`, "testd", matchres::NOMATCH, 0, -1), + (`^test[a-c]$`, "test!", matchres::NOMATCH, 0, -1), + (`^test[a-c]$`, "test-", matchres::NOMATCH, 0, -1), + (`^test[-a-c]$`, "test-", matchres::MATCH, 0, -1), + (`^test[a-c-]$`, "test-", matchres::MATCH, 0, -1), + (`^test[a-c]*$`, "test", matchres::MATCH, 0, -1), + (`^test[a-c]*$`, "testa", matchres::MATCH, 0, -1), + (`^test[a-c]*$`, "testabb", matchres::MATCH, 0, -1), + (`^test[a-c]*$`, "testddd", matchres::NOMATCH, 0, -1), + (`^test[a-c]?$`, "test", matchres::MATCH, 0, -1), + (`^test[a-c]?$`, "testb", matchres::MATCH, 0, -1), + (`^test[a-c]?$`, "testd", matchres::NOMATCH, 0, -1), + (`^test[a-c]+$`, "test", matchres::NOMATCH, 0, -1), + (`^test[a-c]+$`, "testbcbc", matchres::MATCH, 0, -1), + (`^test[a-c]+$`, "testd", matchres::NOMATCH, 0, -1), + (`^test[^a-c]$`, "testa", matchres::NOMATCH, 0, -1), + (`^test[^a-c]$`, "testb", matchres::NOMATCH, 0, -1), + (`^test[^a-c]$`, "testc", matchres::NOMATCH, 0, -1), + (`^test[^a-c]$`, "testd", matchres::MATCH, 0, -1), + (`^test[^a-c]$`, "test!", matchres::MATCH, 0, -1), + (`^test[^a-c]$`, "test-", matchres::MATCH, 0, -1), + (`^test[^-a-c]$`, "test-", matchres::NOMATCH, 0, -1), + (`^test[^a-c-]$`, "test-", matchres::NOMATCH, 0, -1), + (`^test[^a-c-]*$`, "test", matchres::MATCH, 0, -1), + (`^test[^a-c-]*$`, "test--", matchres::NOMATCH, 0, -1), + (`^test[^a-c-]*$`, "testq", matchres::MATCH, 0, -1), + (`^test[^a-c-]?$`, "test", matchres::MATCH, 0, -1), + (`^test[^a-c-]?$`, "testq", matchres::MATCH, 0, -1), + (`^test[^a-c-]?$`, "test-", matchres::NOMATCH, 0, -1), + (`^test[^a-c-]+$`, "test", matchres::NOMATCH, 0, -1), + (`^test[^a-c-]+$`, "testb", matchres::NOMATCH, 0, -1), + (`^test[^a-c-]+$`, "testddd", matchres::MATCH, 0, -1), + (`([a-z][a-z0-9]*,)+`, "a5,b7,c9,", matchres::MATCH, 0, -1), + // [:alpha:] etc. + (`^test[[:alnum:]]+$`, "testaA1", matchres::MATCH, 0, -1), + (`^test[[:alnum:]]+$`, "testa_1", matchres::NOMATCH, 0, -1), + (`^test[[:alpha:]]+$`, "testa", matchres::MATCH, 0, -1), + (`^test[[:alpha:]]+$`, "testa1", matchres::NOMATCH, 0, -1), + (`^test[[:blank:]]+$`, "testa", matchres::NOMATCH, 0, -1), + (`^test[[:blank:]]+$`, "test ", matchres::MATCH, 0, -1), + (`^test[^[:blank:]]+$`, "testx", matchres::MATCH, 0, -1), + (`^test[[:blank:]]+$`, "test ", matchres::MATCH, 0, -1), + (`^test[^[:cntrl:]]+$`, "testa", matchres::MATCH, 0, -1), + (`^test[[:digit:]]$`, "test1", matchres::MATCH, 0, -1), + (`^test[[:digit:]]$`, "testa", matchres::NOMATCH, 0, -1), + (`^test[[:graph:]]+$`, "test\t", matchres::NOMATCH, 0, -1), + (`^test[[:lower:]]+$`, "testa", matchres::MATCH, 0, -1), + (`^test[[:lower:]]+$`, "testA", matchres::NOMATCH, 0, -1), + (`^test[[:print:]]+$`, "test\t", matchres::NOMATCH, 0, -1), + (`^test[[:punct:]]+$`, "testA", matchres::NOMATCH, 0, -1), + (`^test[[:punct:]]+$`, "test!", matchres::MATCH, 0, -1), + (`^test[[:space:]]+$`, "test ", matchres::MATCH, 0, -1), + (`^test[[:upper:]]+$`, "testa", matchres::NOMATCH, 0, -1), + (`^test[[:upper:]]+$`, "testA", matchres::MATCH, 0, -1), + (`^test[[:word:]]+$`, "test!2", matchres::NOMATCH, 0, -1), + (`^test[[:word:]]+$`, "test_2", matchres::MATCH, 0, -1), + (`^test[[:xdigit:]]+$`, "testCAFE", matchres::MATCH, 0, -1), + // [:alpha:] etc. plus extra characters + (`^test[[:digit:]][[:alpha:]]$`, "test1a", matchres::MATCH, 0, -1), + (`^test[[:digit:]][[:alpha:]]$`, "testa1", matchres::NOMATCH, 0, -1), + (`^test[[:alnum:]!]+$`, "testa!1", matchres::MATCH, 0, -1), + (`^test[@[:alnum:]!]+$`, "testa!@1", matchres::MATCH, 0, -1), + // Escaped characters such as \+ + (`^a\+b$`, "a+b", matchres::MATCH, 0, -1), + (`^a\?b$`, "a?b", matchres::MATCH, 0, -1), + (`^a\*b$`, "a*b", matchres::MATCH, 0, -1), + (`^a\^b$`, "a^b", matchres::MATCH, 0, -1), + (`^a\$b$`, "a$b", matchres::MATCH, 0, -1), + (`^a\[b$`, "a[b", matchres::MATCH, 0, -1), + (`^a\]b$`, "a]b", matchres::MATCH, 0, -1), + (`^a\(b$`, "a(b", matchres::MATCH, 0, -1), + (`^a\)b$`, "a)b", matchres::MATCH, 0, -1), + (`^a\|b$`, "a|b", matchres::MATCH, 0, -1), + (`^a\.b$`, "a.b", matchres::MATCH, 0, -1), + (`^a\\b$`, "a\\b", matchres::MATCH, 0, -1), + // {m,n} + (`^x(abc){1,2}$`, "xabc", matchres::MATCH, 0, -1), + (`^x(abc){1,2}$`, "xabcabc", matchres::MATCH, 0, -1), + (`^x(abc){1,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1), + (`^x(abc){,2}$`, "xabc", matchres::MATCH, 0, -1), + (`^x(abc){,2}$`, "xabcabc", matchres::MATCH, 0, -1), + (`^x(abc){,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1), + (`^x(abc){1,}$`, "xabc", matchres::MATCH, 0, -1), + (`^x(abc){1,}$`, "xabcabc", matchres::MATCH, 0, -1), + (`^x(abc){3,}$`, "xabcabc", matchres::NOMATCH, 0, -1), + (`^x(abc){3,}$`, "xabcabcabc", matchres::MATCH, 0, -1), + (`^x(abc){2,2}$`, "xabcabc", matchres::MATCH, 0, -1), + (`^x(abc){2,2}$`, "xabc", matchres::NOMATCH, 0, -1), + (`^x(abc){2,2}$`, "xabcabcabc", matchres::NOMATCH, 0, -1), + (`^x(abc){-1,2}$`, "xabcabcabc", matchres::ERROR, 0, -1), + (`^x(abc){x,2}$`, "xabcabcabc", matchres::ERROR, 0, -1), + (`^x(abc){0,-2}$`, "xabcabcabc", matchres::ERROR, 0, -1), + // various + ( + `^.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+$`, + "X1024facefacecaaaaafffcafebabebabe", + matchres::MATCH, 0, -1, + ), + ( + `.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+`, + "X1024facefacecaaaaafffcafebabebabe", + matchres::MATCH, 0, -1, + ), + ( + `^.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+$`, + "1024facefacecaaaaafffcafebabebabe", + matchres::NOMATCH, 0, 0, + ), + ( + `.(1024)?(face)*(1024)*ca*(f+e?cafe)(babe)+`, + "1024facefacecaaaaafffcafebabebabe", + matchres::MATCH, 3, -1, + ), + ( + `^([a-zA-Z]{1,2}[[:digit:]]{1,2})[[:space:]]*([[:digit:]][a-zA-Z]{2})$`, + "M15 4QN", + matchres::MATCH, 0, -1 + ), + // tests from perl + (`abc`, "abc", matchres::MATCH, 0, -1), + (`abc`, "xbc", matchres::NOMATCH, 0, 0), + (`abc`, "axc", matchres::NOMATCH, 0, 0), + (`abc`, "abx", matchres::NOMATCH, 0, 0), + (`abc`, "xabcy", matchres::MATCH, 1, 4), + (`abc`, "ababc", matchres::MATCH, 2, -1), + (`ab*c`, "abc", matchres::MATCH, 0, -1), + (`ab*bc`, "abc", matchres::MATCH, 0, -1), + (`ab*bc`, "abbc", matchres::MATCH, 0, -1), + (`ab*bc`, "abbbbc", matchres::MATCH, 0, -1), + (`ab{0,}bc`, "abbbbc", matchres::MATCH, 0, -1), + (`ab+bc`, "abbc", matchres::MATCH, 0, -1), + (`ab+bc`, "abc", matchres::NOMATCH, 0, 0), + (`ab+bc`, "abq", matchres::NOMATCH, 0, 0), + (`ab{1,}bc`, "abq", matchres::NOMATCH, 0, 0), + (`ab+bc`, "abbbbc", matchres::MATCH, 0, -1), + (`ab{1,}bc`, "abbbbc", matchres::MATCH, 0, -1), + (`ab{1,3}bc`, "abbbbc", matchres::MATCH, 0, -1), + (`ab{3,4}bc`, "abbbbc", matchres::MATCH, 0, -1), + (`ab{4,5}bc`, "abbbbc", matchres::NOMATCH, 0, 0), + (`ab?bc`, "abbc", matchres::MATCH, 0, -1), + (`ab?bc`, "abc", matchres::MATCH, 0, -1), + (`ab{0,1}bc`, "abc", matchres::MATCH, 0, -1), + (`ab?bc`, "abbbbc", matchres::NOMATCH, 0, 0), + (`ab?c`, "abc", matchres::MATCH, 0, -1), + (`ab{0,1}c`, "abc", matchres::MATCH, 0, -1), + (`^abc$`, "abc", matchres::MATCH, 0, -1), + (`^abc$`, "abcc", matchres::NOMATCH, 0, 0), + (`^abc`, "abcc", matchres::MATCH, 0, 3), + (`^abc$`, "aabc", matchres::NOMATCH, 0, 0), + (`abc$`, "aabc", matchres::MATCH, 1, -1), + (`^`, "abc", matchres::MATCH, 0, 0), + (`$`, "abc", matchres::MATCH, 3, 3), + (`a.c`, "abc", matchres::MATCH, 0, -1), + (`a.c`, "axc", matchres::MATCH, 0, -1), + (`a.*c`, "axyzc", matchres::MATCH, 0, -1), + (`a.*c`, "axyzd", matchres::NOMATCH, 0, 0), + (`a[bc]d`, "abc", matchres::NOMATCH, 0, 0), + (`a[bc]d`, "abd", matchres::MATCH, 0, -1), + (`a[b-d]e`, "abd", matchres::NOMATCH, 0, 0), + (`a[b-d]e`, "ace", matchres::MATCH, 0, -1), + (`a[b-d]`, "aac", matchres::MATCH, 1, -1), + (`a[-b]`, "a-", matchres::MATCH, 0, -1), + (`a[b-]`, "a-", matchres::MATCH, 0, -1), + (`a[b-a]`, "-", matchres::ERROR, 0, 0), + (`a[]b`, "-", matchres::ERROR, 0, 0), + (`a[`, "-", matchres::ERROR, 0, 0), + (`a]`, "a]", matchres::MATCH, 0, -1), + (`a[]]b`, "a]b", matchres::MATCH, 0, -1), + (`a[^bc]d`, "aed", matchres::MATCH, 0, -1), + (`a[^bc]d`, "abd", matchres::NOMATCH, 0, 0), + (`a[^-b]c`, "adc", matchres::MATCH, 0, -1), + (`a[^-b]c`, "a-c", matchres::NOMATCH, 0, 0), + (`a[^]b]c`, "a]c", matchres::NOMATCH, 0, 0), + (`a[^]b]c`, "adc", matchres::MATCH, 0, -1), + (`()ef`, "def", matchres::MATCH, 1, -1), + (`*a`, "-", matchres::ERROR, 0, 0), + (`(*)b`, "-", matchres::ERROR, 0, 0), + (`$b`, "b", matchres::ERROR, 0, 0), + (`a\`, "-", matchres::ERROR, 0, 0), + (`a\(b`, "a(b", matchres::MATCH, 0, -1), + (`a\(*b`, "ab", matchres::MATCH, 0, -1), + (`a\(*b`, "a((b", matchres::MATCH, 0, -1), + (`a\\b`, `a\b`, matchres::MATCH, 0, -1), + (`abc)`, "-", matchres::ERROR, 0, 0), + (`(abc`, "-", matchres::ERROR, 0, 0), + (`(a)b(c)`, "abc", matchres::MATCH, 0, -1), + (`a+b+c`, "aabbabc", matchres::MATCH, 4, -1), + (`a{1,}b{1,}c`, "aabbabc", matchres::MATCH, 4, -1), + (`a**`, "-", matchres::ERROR, 0, 0), + (`)(`, "-", matchres::ERROR, 0, 0), + (`[^ab]*`, "cde", matchres::MATCH, 0, -1), + (`abc`, "", matchres::NOMATCH, 0, 0), + (`a*`, "", matchres::MATCH, 0, -1), + (`([abc])*d`, "abbbcd", matchres::MATCH, 0, -1), + (`([abc])*bcd`, "abcd", matchres::MATCH, 0, -1), + (`abcd*efg`, "abcdefg", matchres::MATCH, 0, -1), + (`ab*`, "xabyabbbz", matchres::MATCH, 1, 3), + (`ab*`, "xayabbbz", matchres::MATCH, 1, 2), + (`(ab|cd)e`, "abcde", matchres::MATCH, 2, -1), + (`[abhgefdc]ij`, "hij", matchres::MATCH, 0, -1), + (`^(ab|cd)e`, "abcde", matchres::NOMATCH, 0, 0), + (`(abc|)ef`, "abcdef", matchres::MATCH, 4, -1), + (`(a|b)c*d`, "abcd", matchres::MATCH, 1, -1), + (`(ab|ab*)bc`, "abc", matchres::MATCH, 0, -1), + (`a([bc]*)c*`, "abc", matchres::MATCH, 0, -1), + (`a([bc]*)(c*d)`, "abcd", matchres::MATCH, 0, -1), + (`a([bc]+)(c*d)`, "abcd", matchres::MATCH, 0, -1), + (`a([bc]*)(c+d)`, "abcd", matchres::MATCH, 0, -1), + (`a[bcd]*dcdcde`, "adcdcde", matchres::MATCH, 0, -1), + (`a[bcd]+dcdcde`, "adcdcde", matchres::NOMATCH, 0, 0), + (`(ab|a)b*c`, "abc", matchres::MATCH, 0, -1), + (`[a-zA-Z_][a-zA-Z0-9_]*`, "alpha", matchres::MATCH, 0, -1), + (`^a(bc+|b[eh])g|.h$`, "abh", matchres::MATCH, 0, -1), + (`multiple words of text`, "uh-uh", matchres::NOMATCH, 0, 0), + (`multiple words`, "multiple words, yeah", matchres::MATCH, 0, 14), + (`(.*)c(.*)`, "abcde", matchres::MATCH, 0, -1), + (`\((.*), (.*)\)`, "(a, b)", matchres::MATCH, 0, -1), + (`[k]`, "ab", matchres::NOMATCH, 0, 0), + (`a[-]?c`, "ac", matchres::MATCH, 0, -1), + (`.*d`, "abc\nabd", matchres::MATCH, 0, -1), + (`(`, "", matchres::ERROR, 0, 0), + (`(x?)?`, "x", matchres::MATCH, 0, -1), + (`^*`, "", matchres::ERROR, 0, 0), + // Submatch handling + (`(a|ab)(c|bcd)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4) + (`(a|ab)(bcd|c)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4) + (`(ab|a)(c|bcd)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4) + (`(ab|a)(bcd|c)(d*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4) + (`(a*)(b|abc)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3) + (`(a*)(abc|b)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3) + (`(a*)(b|abc)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3) + (`(a*)(abc|b)(c*)`, "abc", matchres::MATCH, 0, -1), // POSIX: (0,3)(0,1)(1,2)(2,3) + (`(a|ab)(c|bcd)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4) + (`(a|ab)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4) + (`(ab|a)(c|bcd)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4) + (`(ab|a)(bcd|c)(d|.*)`, "abcd", matchres::MATCH, 0, -1), // POSIX: (0,4)(0,2)(2,3)(3,4) + // TODO: whole-expression alternation + // (`ab|cd`, "abc", matchres::MATCH, 0, -1), + // (`ab|cd`, "abcd", matchres::MATCH, 0, -1), + // TODO: multiple alternation + // (`a|b|c|d|e`, "e", matchres::MATCH, 0, -1), + // (`(a|b|c|d|e)f`, "ef", matchres::MATCH, 0, -1), + // TODO: nested capture groups + // (`((a))`, "abc", matchres::MATCH, 0, -1), + // (`((a)(b)c)(d)`, "abcd", matchres::MATCH, 0, -1), + // (`(bc+d$|ef*g.|h?i(j|k))`, "effgz", matchres::MATCH, 0, -1), + // (`(bc+d$|ef*g.|h?i(j|k))`, "ij", matchres::MATCH, 0, -1), + // (`(bc+d$|ef*g.|h?i(j|k))`, "effg", matchres::NOMATCH, 0, 0), + // (`(bc+d$|ef*g.|h?i(j|k))`, "bcdd", matchres::NOMATCH, 0, 0), + // (`(bc+d$|ef*g.|h?i(j|k))`, "reffgz", matchres::MATCH, 0, -1), + // (`((((((((((a))))))))))`, "a", matchres::MATCH, 0, -1), + // (`(((((((((a)))))))))`, "a", matchres::MATCH, 0, -1), + // (`(([a-z]+):)?([a-z]+)$`, "smil", matchres::MATCH, 0, -1), + // (`^((a)c)?(ab)$`, "ab", matchres::MATCH, 0, -1), + // TODO: multiple simultaneous capture groups + // (`(a+|b)*`, "ab", matchres::MATCH, 0, -1), + // (`(a+|b){0,}`, "ab", matchres::MATCH, 0, -1), + // (`(a+|b)+`, "ab", matchres::MATCH, 0, -1), + // (`(a+|b){1,}`, "ab", matchres::MATCH, 0, -1), + // (`(a+|b)?`, "ab", matchres::MATCH, 0, -1), + // (`(a+|b){0,1}`, "ab", matchres::MATCH, 0, -1), + // NOTE: character sequences not currently supported + // (`\0`, "\0", matchres::MATCH, 0, -1), + // (`[\0a]`, "\0", matchres::MATCH, 0, -1), + // (`[a\0]`, "\0", matchres::MATCH, 0, -1), + // (`[^a\0]`, "\0", matchres::NOMATCH, 0, 0), + // NOTE: octal sequences not currently supported + // (`[\1]`, "\1", matchres::MATCH, 0, -1), + // (`\09`, "\0(separate-me)9", matchres::MATCH, 0, -1), + // (`\141`, "a", matchres::MATCH, 0, -1), + // (`[\41]`, "!", matchres::MATCH, 0, -1), + // NOTE: hex sequences not currently supported + // (`\xff`, "\377", matchres::MATCH, 0, -1), + // NOTE: non-greedy matching not currently supported + // (`a.+?c`, "abcabc", matchres::MATCH, 0, -1), + // (`.*?\S *:`, "xx:", matchres::MATCH, 0, -1), + // (`a[ ]*?\ (\d+).*`, "a 10", matchres::MATCH, 0, -1), + // (`a[ ]*?\ (\d+).*`, "a 10", matchres::MATCH, 0, -1), + // (`"(\\"|[^"])*?"`, `"\""`, matchres::MATCH, 0, -1), + // (`^.*?$`, "one\ntwo\nthree\n", matchres::NOMATCH, 0, 0), + // (`a[^>]*?b`, "a>b", matchres::NOMATCH, 0, 0), + // (`^a*?$`, "foo", matchres::NOMATCH, 0, 0), + // (`^([ab]*?)(?=(b)?)c`, "abc", matchres::MATCH, 0, -1), + // (`^([ab]*?)(?!(b))c`, "abc", matchres::MATCH, 0, -1), + // (`^([ab]*?)(?<!(a))c`, "abc", matchres::MATCH, 0, -1), + ]; + + for (let i = 0z; i < len(cases); i += 1) { + const expr = cases[i].0; + const string = cases[i].1; + const should_match = cases[i].2; + const start = cases[i].3; + const end = if (cases[i].4 == -1) { + yield len(string): int; + } else { + yield cases[i].4; + }; + run_find_case(expr, string, should_match, start, end); + }; + + if (PERFTEST_ON) { + const f = os::open("compress/zlib/data+test.ha")!; + const buf: [2000000]u8 = [0...]; + io::read(f, buf)!; + const test_data = strings::fromutf8(buf); + + const perf_cases = [ + ( + `^a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*a*aaaaaaaaaaaaaaaa$`, + "aaaaaaaaaaaaaaaa", + matchres::MATCH, + 0, -1 + ), + (`.*.*.*rand_out`, test_data, matchres::MATCH, 0, 1211255), + (`0x`, test_data, matchres::MATCH, 86, 88), + ]; + + for (let i = 0z; i < len(perf_cases); i += 1) { + const expr = perf_cases[i].0; + const string = perf_cases[i].1; + const should_match = perf_cases[i].2; + const start = perf_cases[i].3; + const end = if (perf_cases[i].4 == -1) { + yield len(string): int; + } else { + yield perf_cases[i].4; + }; + run_find_case(expr, string, should_match, start, end); + }; + }; +}; + +@test fn findall() void = { + const cases = [ + (`ab.`, "hello abc and abz test abq thanks", matchres::MATCH, 3), + ]; + + for (let i = 0z; i < len(cases); i += 1) { + const expr = cases[i].0; + const string = cases[i].1; + const should_match = cases[i].2; + const count = cases[i].3; + run_findall_case(expr, string, should_match, count); + }; + + if (PERFTEST_ON) { + const f = os::open("compress/zlib/data+test.ha")!; + const buf: [2000000]u8 = [0...]; + io::read(f, buf)!; + const test_data = strings::fromutf8(buf); + + const perf_cases = [ + (`0x`, test_data, matchres::MATCH, 199053), + ]; + + for (let i = 0z; i < len(perf_cases); i += 1) { + const expr = perf_cases[i].0; + const string = perf_cases[i].1; + const should_match = perf_cases[i].2; + const count = perf_cases[i].3; + run_findall_case(expr, string, should_match, count); + }; + }; +}; diff --git a/regex/README b/regex/README @@ -0,0 +1,41 @@ +This is a NFA-based regex implementation. It closely adheres to the POSIX +Extended Regular Expressions specification [0]. Matching is guaranteed to run in +linear time, and various optimisations have been implemented to ensure good +performance on most inputs. + +By default, matches will be found anywhere in the given string. The ^ and $ +characters can be used to anchor the match to the beginning or end of the +string. + +find() returns a slice of [[regex::matchgroup]]s for the first match. The +first [[regex::matchgroup]] represents the entire match, while the rest +represent the submatches, specified in the expression using (parens). + +findall() finds all non-overlapping matches in the given string and returns +a slice of slices of [[regex::matchgroup]]s. + +This module implements the POSIX match disambiguation rules by returning +the longest match among the leftmost matches. + + const re = regex::compile(`[Hh]are`)!; + defer regex::regex_free(re); + + const first_match = regex::find(re, "Hello Hare, hello Hare.")!; + match (first_match) { + case void => void; + case let groups: []regex::matchgroup => + defer free(groups); + // The match groups provide the content, start index and end index of + // the main match, as well as all submatches. + }; + + const all_matches = regex::findall(re, "Hello hare, hello hare.")!; + match (all_matches) { + case void => void; + case let groupsets: [][]regex::matchgroup => + defer regex::freeall(groupsets); + // A slice of multiple match group sets, which can be used similarly + // to the find() example. + }; + +[0]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04 diff --git a/regex/regex.ha b/regex/regex.ha @@ -0,0 +1,813 @@ +// License: MPL-2.0 +// (c) 2022 Vlad-Stefan Harbuz <vlad@vladh.net> +use ascii; +use encoding::utf8; +use errors; +use strconv; +use strings; + +// A string describing the error the occurred. +export type error = !str; + +export type match_type = enum { ANCHORED, FLOATING }; +export type inst_lit = rune, + inst_charset = struct { idx: size, is_positive: bool }, + inst_any = void, + inst_split = size, + inst_jump = size, + inst_skip = void, + inst_match = match_type, + inst_groupstart = void, + inst_groupend = void, + inst_repeat = struct { + id: size, + origin: size, + min: (void | size), + max: (void | size), + }; +export type inst = (inst_lit | inst_any | inst_split | inst_jump | + inst_skip | inst_match | inst_charset | + inst_groupstart | inst_groupend | + inst_repeat); + +// A (sub)match found as a result of matching a certain string against a regex. +export type matchgroup = struct { + content: str, + start: size, + end: size, +}; + +type thread = struct { + pc: size, + start_idx: size, + root_group: matchgroup, + groups: []matchgroup, + curr_group: matchgroup, + curr_group_inited: bool, + rep_counters: []size, + matched: bool, + failed: bool, +}; + +type newmatch = void; + +export type charclass = enum { + ALNUM, ALPHA, ASCII, BLANK, CNTRL, DIGIT, GRAPH, LOWER, PRINT, PUNCT, + SPACE, UPPER, WORD, XDIGIT, +}; +export type charset = [](charset_lit_item | charset_range_item | + charset_class_item), + charset_lit_item = rune, + charset_range_item = (u8, u8), + charset_class_item = charclass; +const charclass_names: [](charclass, str) = [ + (charclass::ALNUM, ":alnum:]"), + (charclass::ALPHA, ":alpha:]"), + (charclass::ASCII, ":ascii:]"), + (charclass::BLANK, ":blank:]"), + (charclass::CNTRL, ":cntrl:]"), + (charclass::DIGIT, ":digit:]"), + (charclass::GRAPH, ":graph:]"), + (charclass::LOWER, ":lower:]"), + (charclass::PRINT, ":print:]"), + (charclass::PUNCT, ":punct:]"), + (charclass::SPACE, ":space:]"), + (charclass::UPPER, ":upper:]"), + (charclass::WORD, ":word:]"), + (charclass::XDIGIT, ":xdigit:]"), +]; +const charclass_fns: [](charclass, *fn(c: rune) bool) = [ + (charclass::ALNUM, &ascii::isalnum), + (charclass::ALPHA, &ascii::isalpha), + (charclass::ASCII, &ascii::isascii), + (charclass::BLANK, &ascii::isblank), + (charclass::CNTRL, &ascii::iscntrl), + (charclass::DIGIT, &ascii::isdigit), + (charclass::GRAPH, &ascii::isgraph), + (charclass::LOWER, &ascii::islower), + (charclass::PRINT, &ascii::isprint), + (charclass::PUNCT, &ascii::ispunct), + (charclass::SPACE, &ascii::isspace), + (charclass::UPPER, &ascii::isupper), + (charclass::WORD, &isword), + (charclass::XDIGIT, &ascii::isxdigit), +]; +const multibyte_err: error = "Character ranges do not support characters larger than one byte."; + +export type regex = struct { + insts: []inst, + charsets: []charset, + n_reps: size, +}; + +// Frees the memory used by a regex. +export fn regex_free(re: regex) void = { + free(re.insts); + for (let i = 0z; i < len(re.charsets); i += 1) { + free(re.charsets[i]); + }; + free(re.charsets); +}; + +fn find_last_groupstart(insts: *[]inst) (size | error) = { + for (let i = len(insts); i > 0; i -= 1) { + if (insts[i - 1] is inst_groupstart) { + return i - 1; + }; + }; + return `Encountered ")" token without matching "("`: error; +}; + +fn isword(c: rune) bool = ascii::isalnum(c) || c == '_'; + +fn handle_bracket( + insts: *[]inst, + r: rune, + r_idx: *size, + bracket_idx: *int, + iter: *strings::iterator, + charsets: *[]charset, + skip_charclass_rest: *bool, + is_charset_positive: *bool, + in_bracket: *bool +) (void | error) = { + const peek1 = strings::next(iter); + const peek2 = strings::next(iter); + const peek3 = strings::next(iter); + if (!(peek1 is void)) { + strings::prev(iter); + }; + if (!(peek2 is void)) { + strings::prev(iter); + }; + if (!(peek3 is void)) { + strings::prev(iter); + }; + + if (*bracket_idx == -1) { + append(charsets, alloc([])); + }; + *bracket_idx += 1; + + if (*skip_charclass_rest) { + if (r == ']') { + *skip_charclass_rest = false; + }; + *r_idx += 1; + return; + }; + + const is_range = peek1 is rune && peek1 as rune == '-' && + !(peek2 is void) && !(peek3 is void); + const range_end = peek2; + const is_first_char = *bracket_idx == 0 || *bracket_idx == 1 && + !*is_charset_positive; + if (r == ']' && !is_first_char) { + const newinst = inst_charset { + idx = len(charsets) - 1, + is_positive = *is_charset_positive, + }; + append(insts, newinst); + *in_bracket = false; + *bracket_idx = -1; + *is_charset_positive = true; + } else if (r == '^' && *bracket_idx == 0) { + *is_charset_positive = false; + } else if (r == '[' && !(peek1 is void) && + peek1 as rune == ':') { + const rest = strings::iterstr(iter); + const n_cc = len(charclass_names); + for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) { + const cc = charclass_names[cc_idx]; + if (strings::hasprefix(rest, cc.1)) { + append(charsets[len(charsets) - 1], + cc.0: charset_class_item); + *skip_charclass_rest = true; + break; + }; + }; + if (!*skip_charclass_rest) { + return `Found "[:" in bracket expression and expected a character class such as [:alpha:], but none was found. If you did not mean to use a charclass, try ":["`: error; + }; + } else if (is_range) { + const start_enc = utf8::encoderune(r); + if (len(start_enc) > 1) { + return multibyte_err; + }; + const start_b = start_enc[0]; + + const end_enc = utf8::encoderune(range_end as rune); + if (len(end_enc) > 1) { + return multibyte_err; + }; + const end_b = end_enc[0]; + + if (end_b < start_b) { + return `Found range in bracket expression where end character was before start character, e.g. "[b-a]"`: error; + }; + + append(charsets[len(charsets) - 1], + (start_b, end_b): charset_range_item); + strings::next(iter); + strings::next(iter); + *r_idx += 2; + } else { + append(charsets[len(charsets) - 1], + r: charset_lit_item); + }; + + *r_idx += 1; +}; + +// Compiles a string containing a regular expression into a regex struct. +export fn compile(expr: str) (regex | error) = { + let insts: []inst = alloc([]); + let charsets: []charset = alloc([]); + let iter = strings::iter(expr); + let r_idx = 0z; + let match_type = match_type::FLOATING; + let curr_alt_jump_idx = -1; + let in_bracket = false; + let skip_charclass_rest = false; + let bracket_idx = -1; + let is_charset_positive = true; + let n_reps = 0z; + let n_groupstarts = 0; + + for (true) { + const next = strings::next(&iter); + + if (r_idx == 0 && next is rune && next: rune != '^') { + append(insts, void: inst_skip); + }; + + if (in_bracket) { + if (next is void) { + return `Found unterminated bracket expression, are you missing a closing "]"?`: error; + }; + const r = next: rune; + handle_bracket(&insts, r, &r_idx, &bracket_idx, &iter, + &charsets, &skip_charclass_rest, + &is_charset_positive, + &in_bracket)?; + continue; + }; + + const r = match (next) { + case void => + if (n_groupstarts > 0) { + return "Expression ended, but there were still unclosed groups": error; + }; + break; + case let r: rune => yield r; + }; + switch (r) { + case '\\' => + const peek1 = strings::next(&iter); + if (peek1 is void) { + return "Found an escaping backslash, but there was nothing to escape": error; + } else { + append(insts, (peek1 as rune): inst_lit); + r_idx += 1; + }; + case '^' => + if (r_idx != 0) { + return `Anchor character "^" may only occur at the start of the expression`: error; + }; + case '$' => + if (r_idx != len(expr) - 1) { + return `Anchor character "$" may only occur at the end of the expression`: error; + }; + match_type = match_type::ANCHORED; + case '[' => + in_bracket = true; + case ']' => + if (in_bracket) { + in_bracket = false; + } else { + append(insts, r: inst_lit); + }; + case '(' => + append(insts, void: inst_groupstart); + n_groupstarts += 1; + case ')' => + if (n_groupstarts == 0) { + return "Tried to close group but none was open": error; + }; + n_groupstarts -= 1; + append(insts, void: inst_groupend); + if (curr_alt_jump_idx != -1) { + assert(insts[curr_alt_jump_idx] is inst_jump); + insts[curr_alt_jump_idx] = + (len(insts) - 1): inst_jump; + curr_alt_jump_idx = -1; + }; + case '|' => + append(insts, 9999999: inst_jump); + const origin = find_last_groupstart(&insts)? + 1; + const newinst = (len(insts) + 1): inst_split; + insert(insts[origin], newinst); + curr_alt_jump_idx = (len(insts) - 1): int; + case '{' => + let origin = len(insts) - 1; + if (insts[origin] is inst_groupend) { + origin = find_last_groupstart(&insts)?; + }; + const rest = strings::iterstr(&iter); + const rep_parts = parse_repetition(rest)?; + const can_skip = rep_parts.0 == 0; + const min = if (rep_parts.0 == 0) { + yield 1z; + } else { + yield rep_parts.0; + }; + if (can_skip) { + insert(insts[origin], + len(insts) + 2: inst_split); + origin += 1; + }; + const newinst = inst_repeat { + id = n_reps, + origin = origin, + min = min, + max = rep_parts.1, + }; + for (let i = 0z; i <= rep_parts.2; i += 1) { + strings::next(&iter); + r_idx += 1; + }; + append(insts, newinst); + n_reps += 1; + case '?' => + if (r_idx == 0 || len(insts) == 0) { + return `Found "?" but there was nothing before it`: error; + }; + let term_start_idx = len(insts) - 1; + match (insts[term_start_idx]) { + case (inst_lit | inst_charset | inst_any) => void; + case inst_groupend => + term_start_idx = find_last_groupstart(&insts)?; + case inst_groupstart => + return `Found "?" but it was in an empty group`: error; + case => + return `Invalid use of "?"`: error; + }; + const after_idx = len(insts) + 1; + insert(insts[term_start_idx], after_idx: inst_split); + case '*' => + if (r_idx == 0 || len(insts) == 0) { + return `Found "*" but there was nothing before it`: error; + }; + const new_inst_offset = 1z; + const jump_idx = len(insts) + new_inst_offset; + const after_idx = jump_idx + 1z; + let term_start_idx = len(insts) - 1z; + match (insts[term_start_idx]) { + case (inst_lit | inst_charset | inst_any) => void; + case inst_groupend => + term_start_idx = find_last_groupstart(&insts)?; + case inst_groupstart => + return `Found "*" but it was in an empty group`: error; + case => + return `Invalid use of "*"`: error; + }; + const split_idx = term_start_idx; + term_start_idx += new_inst_offset; + insert(insts[split_idx], after_idx: inst_split); + append(insts, split_idx: inst_jump); + case '+' => + if (r_idx == 0 || len(insts) == 0) { + return `Found "+" but there was nothing before it`: error; + }; + let term_start_idx = len(insts) - 1; + match (insts[term_start_idx]) { + case (inst_lit | inst_charset | inst_any) => void; + case inst_groupend => + term_start_idx = find_last_groupstart(&insts)?; + case inst_groupstart => + return `Found "+" but it was in an empty group`: error; + case => + return `Invalid use of "+"`: error; + }; + append(insts, term_start_idx: inst_split); + case '.' => + append(insts, void: inst_any); + case => + append(insts, r: inst_lit); + }; + r_idx += 1; + }; + + append(insts, match_type: inst_match); + + return regex { + insts = insts, + charsets = charsets, + n_reps = n_reps, + }; +}; + +fn parse_repetition( + s: str +) (((void | size), (void | size), size) | error) = { + const first_comma = strings::index(s, ","); + const first_endbrace = strings::index(s, "}"); + if (first_endbrace is void) { + return "Invalid repetition value": error; + }; + const first_endbrace = first_endbrace as size; + + let min_str = ""; + let max_str = ""; + let is_single_arg = false; + if (first_comma is void || first_endbrace < first_comma as size) { + const cut = strings::cut(s, "}"); + min_str = cut.0; + max_str = cut.0; + is_single_arg = true; + } else { + const cut = strings::cut(s, ","); + min_str = cut.0; + max_str = strings::cut(cut.1, "}").0; + }; + + let min: (void | size) = void; + let max: (void | size) = void; + + if (len(min_str) > 0) { + min = match (strconv::stoi(min_str)) { + case let res: int => + yield if (res < 0) { + return `Only positive integers are allowed inside "{}"`: error; + } else { + yield res: size; + }; + case => return "Invalid repetition minimum value": error; + }; + }; + + if (len(max_str) > 0) { + max = match (strconv::stoi(max_str)) { + case let res: int => + yield if (res < 0) { + return `Only positive integers are allowed inside "{}"`: error; + } else { + yield res: size; + }; + case => return "Invalid repetition maximum value": error; + }; + }; + + const rep_len = if (is_single_arg) { + yield len(min_str); + } else { + yield len(min_str) + 1 + len(max_str); + }; + return (min, max, rep_len); +}; + +fn delete_thread(i: size, threads: *[]thread) void = { + free(threads[i].groups); + free(threads[i].rep_counters); + delete(threads[i]); +}; + +fn is_consuming_inst(a: inst) bool = { + return a is (inst_lit | inst_any | inst_charset); +}; + +fn add_thread(threads: *[]thread, parent_idx: size, new_pc: size) void = { + // Do not add this thread if there is already another thread with + // the same PC + for (let i = 0z; i < len(threads); i += 1) { + if (threads[i].pc == new_pc && + !threads[i].matched && + threads[i].start_idx < + threads[parent_idx].start_idx) { + return; + }; + }; + + append(threads, thread { + pc = new_pc, + start_idx = threads[parent_idx].start_idx, + curr_group = threads[parent_idx].curr_group, + curr_group_inited = threads[parent_idx].curr_group_inited, + matched = threads[parent_idx].matched, + failed = threads[parent_idx].failed, + groups = alloc(threads[parent_idx].groups...), + rep_counters = alloc(threads[parent_idx].rep_counters...), + ... + }); +}; + +fn run_thread( + i: size, + re: regex, + string: str, + threads: *[]thread, + r_or_end: (rune | void), + str_idx: int +) (void | error | newmatch) = { + if (threads[i].matched) { + return; + }; + for (!is_consuming_inst(re.insts[threads[i].pc])) { + match (re.insts[threads[i].pc]) { + case inst_lit => abort(); + case inst_any => abort(); + case inst_split => + const new_pc = re.insts[threads[i].pc]: inst_split: size; + add_thread(threads, i, new_pc); + threads[i].pc += 1; + case inst_jump => + threads[i].pc = re.insts[threads[i].pc]: inst_jump: size; + case inst_skip => + const new_pc = threads[i].pc + 1; + threads[i].start_idx = str_idx: size; + add_thread(threads, i, new_pc); + break; + case inst_match => + // Do not match if we need an end-anchored match, but we + // have not exhausted our string + const mt = re.insts[threads[i].pc]: inst_match: match_type; + if (mt == match_type::ANCHORED && !(r_or_end is void)) { + threads[i].failed = true; + return; + }; + threads[i].root_group = matchgroup { + start = threads[i].start_idx, + end = str_idx: size, + // TODO: This is a perf issue for large strings + content = strings::sub(string, + threads[i].start_idx, + str_idx: size), + }; + threads[i].matched = true; + return newmatch; + case inst_groupstart => + if (threads[i].curr_group_inited) { + return "Found nested capture groups in expression, which are not supported": error; + }; + threads[i].curr_group.start = str_idx: size; + threads[i].curr_group_inited = true; + threads[i].pc += 1; + case inst_groupend => + if (!threads[i].curr_group_inited) { + return `Found a groupend token ")" without having previously seen a groupstart token "("`: error; + }; + threads[i].curr_group.end = str_idx: size; + // TODO: This is a perf issue for large strings + threads[i].curr_group.content = strings::sub(string, + threads[i].curr_group.start, + threads[i].curr_group.end); + append(threads[i].groups, threads[i].curr_group); + threads[i].curr_group = matchgroup { ... }; + threads[i].curr_group_inited = false; + threads[i].pc += 1; + case let ir: inst_repeat => + assert(ir.id < len(threads[i].rep_counters)); + threads[i].rep_counters[ir.id] += 1; + if (ir.max is size && + threads[i].rep_counters[ir.id] > + ir.max as size) { + threads[i].failed = true; + return; + }; + const new_pc = threads[i].pc + 1; + threads[i].pc = ir.origin; + if (ir.min is void || + threads[i].rep_counters[ir.id] >= + ir.min as size) { + add_thread(threads, i, new_pc); + }; + }; + }; + + // From now on, we're only matching consuming instructions, and these + // can't do anything without another rune. + if (r_or_end is void) { + threads[i].failed = true; + return; + }; + + const r = r_or_end as rune; + + match (re.insts[threads[i].pc]) { + case inst_skip => return; + case let lit: inst_lit => + if (r != lit) { + threads[i].failed = true; + }; + case inst_any => void; + case let cs: inst_charset => + const charset = re.charsets[cs.idx]; + // Disprove the match if we're looking for a negative match + // Prove the match if we're looking for a positive match + let matched = !cs.is_positive; + for (let i = 0z; i < len(charset); i += 1) match (charset[i]) { + case let lit: charset_lit_item => + if (r == lit) { + // Succeeded if positive match + // Failed if negative match + matched = cs.is_positive; + break; + }; + case let range: charset_range_item => + const r_enc = utf8::encoderune(r); + if (len(r_enc) > 1) { + return multibyte_err; + }; + const r_b = r_enc[0]; + if (r_b >= range.0 && r_b <= range.1) { + // Succeeded if positive match + // Failed if negative match + matched = cs.is_positive; + break; + }; + case let class: charset_class_item => + const n_cc = len(charclass_fns); + for (let cc_idx = 0z; cc_idx < n_cc; cc_idx += 1) { + const cc = charclass_fns[cc_idx]; + if (cc.0 == class: charclass && cc.1(r)) { + // Succeeded if positive match + // Failed if negative match + matched = cs.is_positive; + break; + }; + }; + }; + if (!matched) { + threads[i].failed = true; + }; + }; + + threads[i].pc += 1; +}; + +// Attempts to match a regular expression against a string and returns the +// either the longest leftmost match or all matches. +fn search( + re: regex, + string: str, + str_iter: *strings::iterator, + str_idx: *int +) (void | []matchgroup | error) = { + let threads: []thread = alloc([ + thread { groups = alloc([]), ... } + ]); + if (re.n_reps > 0) { + threads[0].rep_counters = alloc([0...], re.n_reps); + }; + defer { + for (let i = 0z; i < len(threads); i += 1) { + free(threads[i].groups); + free(threads[i].rep_counters); + }; + free(threads); + }; + + let first_match_idx: (void | size) = void; + + for (true) { + if (len(threads) == 0) { + return void; + }; + + let all_matched = true; + for (let i = 0z; i < len(threads); i += 1) { + if (!threads[i].matched) { + all_matched = false; + break; + }; + }; + + if (all_matched) { + let best_len = 0z; + let best_n_groups = 0z; + let best_idx = 0z; + for (let i = 0z; i < len(threads); i += 1) { + let match_len = threads[i].root_group.end - + threads[i].root_group.start; + const is_better = match_len > best_len || + match_len == best_len && + len(threads[i].groups) > best_n_groups; + if (is_better) { + best_len = match_len; + best_idx = i; + best_n_groups = len(threads[i].groups); + }; + }; + let res: []matchgroup = alloc([], + len(threads[best_idx].groups) + 1); + append(res, threads[best_idx].root_group); + append(res, threads[best_idx].groups...); + return res; + }; + + const r_or_end = strings::next(str_iter); + *str_idx += 1; + + for (let i = 0z; i < len(threads); i += 1) { + const res = run_thread(i, re, string, &threads, + r_or_end, *str_idx)?; + const matchlen = threads[i].root_group.end - + threads[i].root_group.start; + const is_better = res is newmatch && matchlen > 0 && + (first_match_idx is void || + threads[i].start_idx < + first_match_idx as size); + if (is_better) { + first_match_idx = threads[i].start_idx; + }; + }; + + // When we only want the leftmost match, delete all threads that + // start after the earliest non-zero-length matched thread + if (first_match_idx is size) { + for (let i = 0z; i < len(threads); i += 1) { + if (threads[i].start_idx > + first_match_idx as size) { + threads[i].failed = true; + }; + }; + }; + + // Delete threads that have a PC that has already been + // encountered in previous threads. Prioritise threads that + // have an earlier start_idx, and threads that were added + // earlier. + for (let i = 0i64; i < len(threads): i64 - 1; i += 1) { + for (let j = i + 1; j < len(threads): i64; j += 1) { + const same_pc = threads[i].pc == threads[j].pc; + const none_matched = !threads[j].matched && + !threads[i].matched; + if (same_pc && none_matched) { + if (threads[i].start_idx <= + threads[j].start_idx) { + delete_thread(j: size, &threads); + j -= 1; + } else { + delete_thread(i: size, &threads); + i -= 1; + break; + }; + }; + }; + }; + + for (let i = 0z; i < len(threads); i += 1) { + if (threads[i].failed) { + delete_thread(i, &threads); + i -= 1; + }; + }; + }; + + return void; +}; + + +// Attempts to match a regular expression against a string and returns the +// either the longest leftmost match or all matches. +export fn find(re: regex, string: str) (void | []matchgroup | error) = { + let str_idx = -1; + let str_iter = strings::iter(string); + return search(re, string, &str_iter, &str_idx); +}; + +// Attempts to match a regular expression against a string and returns all +// non-overlapping matches. +export fn findall(re: regex, string: str) (void | [][]matchgroup | error) = { + let res: [][]matchgroup = alloc([]); + let str_idx = -1; + let str_iter = strings::iter(string); + for (true) { + const findres = search(re, string, &str_iter, &str_idx)?; + match (findres) { + case let m: []matchgroup => + append(res, m); + assert(str_idx: size >= m[0].end); + for (str_idx: size > m[0].end) { + strings::prev(&str_iter); + str_idx -= 1; + }; + if (str_idx: size >= len(string)) { + break; + }; + case void => break; + }; + }; + if (len(res) == 0) { + return void; + }; + return res; +}; + +// Frees all the matches in a slice and the slice itself. +export fn freeall(s: [][]matchgroup) void = { + for (let i = 0z; i < len(s); i += 1) { + free(s[i]); + }; + free(s); +}; diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib @@ -1025,6 +1025,16 @@ path() { gen_ssa path strings bufio bytes io } +regex() { + if [ $testing -eq 0 ]; then + gen_srcs regex regex.ha + gen_ssa regex encoding::utf8 errors strconv strings + else + gen_srcs regex regex.ha +test.ha + gen_ssa regex encoding::utf8 errors strconv strings fmt io os + fi +} + gensrcs_strconv() { gen_srcs strconv \ types.ha \ @@ -1298,6 +1308,7 @@ net::uri os linux freebsd os::exec linux freebsd path +regex shlex slices sort diff --git a/stdlib.mk b/stdlib.mk @@ -548,6 +548,12 @@ stdlib_deps_any+=$(stdlib_path_any) stdlib_path_linux=$(stdlib_path_any) stdlib_path_freebsd=$(stdlib_path_any) +# gen_lib regex (any) +stdlib_regex_any=$(HARECACHE)/regex/regex-any.o +stdlib_deps_any+=$(stdlib_regex_any) +stdlib_regex_linux=$(stdlib_regex_any) +stdlib_regex_freebsd=$(stdlib_regex_any) + # gen_lib shlex (any) stdlib_shlex_any=$(HARECACHE)/shlex/shlex-any.o stdlib_deps_any+=$(stdlib_shlex_any) @@ -1612,6 +1618,16 @@ $(HARECACHE)/path/path-any.ssa: $(stdlib_path_any_srcs) $(stdlib_rt) $(stdlib_st @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Npath \ -t$(HARECACHE)/path/path.td $(stdlib_path_any_srcs) +# regex (+any) +stdlib_regex_any_srcs= \ + $(STDLIB)/regex/regex.ha + +$(HARECACHE)/regex/regex-any.ssa: $(stdlib_regex_any_srcs) $(stdlib_rt) $(stdlib_encoding_utf8_$(PLATFORM)) $(stdlib_errors_$(PLATFORM)) $(stdlib_strconv_$(PLATFORM)) $(stdlib_strings_$(PLATFORM)) + @printf 'HAREC \t$@\n' + @mkdir -p $(HARECACHE)/regex + @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nregex \ + -t$(HARECACHE)/regex/regex.td $(stdlib_regex_any_srcs) + # shlex (+any) stdlib_shlex_any_srcs= \ $(STDLIB)/shlex/split.ha @@ -2429,6 +2445,12 @@ testlib_deps_any+=$(testlib_path_any) testlib_path_linux=$(testlib_path_any) testlib_path_freebsd=$(testlib_path_any) +# gen_lib regex (any) +testlib_regex_any=$(TESTCACHE)/regex/regex-any.o +testlib_deps_any+=$(testlib_regex_any) +testlib_regex_linux=$(testlib_regex_any) +testlib_regex_freebsd=$(testlib_regex_any) + # gen_lib shlex (any) testlib_shlex_any=$(TESTCACHE)/shlex/shlex-any.o testlib_deps_any+=$(testlib_shlex_any) @@ -3533,6 +3555,17 @@ $(TESTCACHE)/path/path-any.ssa: $(testlib_path_any_srcs) $(testlib_rt) $(testlib @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Npath \ -t$(TESTCACHE)/path/path.td $(testlib_path_any_srcs) +# regex (+any) +testlib_regex_any_srcs= \ + $(STDLIB)/regex/regex.ha \ + $(STDLIB)/regex/+test.ha + +$(TESTCACHE)/regex/regex-any.ssa: $(testlib_regex_any_srcs) $(testlib_rt) $(testlib_encoding_utf8_$(PLATFORM)) $(testlib_errors_$(PLATFORM)) $(testlib_strconv_$(PLATFORM)) $(testlib_strings_$(PLATFORM)) $(testlib_fmt_$(PLATFORM)) $(testlib_io_$(PLATFORM)) $(testlib_os_$(PLATFORM)) + @printf 'HAREC \t$@\n' + @mkdir -p $(TESTCACHE)/regex + @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nregex \ + -t$(TESTCACHE)/regex/regex.td $(testlib_regex_any_srcs) + # shlex (+any) testlib_shlex_any_srcs= \ $(STDLIB)/shlex/split.ha \