hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit da2aa867b2ecfd62b8f7cdf5f24e703026369617
parent bc4828649ae99fe6586c61fa000598a7a7a8b53e
Author: Bor Grošelj Simić <bor.groseljsimic@telemach.net>
Date:   Sun, 12 Sep 2021 13:43:24 +0200

fnmatch: new module

Signed-off-by: Bor Grošelj Simić <bor.groseljsimic@telemach.net>

Diffstat:
Afnmatch/+test.ha | 193+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Afnmatch/fnmatch.ha | 343+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mscripts/gen-stdlib | 11+++++++++++
Mstdlib.mk | 29+++++++++++++++++++++++++++++
4 files changed, 576 insertions(+), 0 deletions(-)

diff --git a/fnmatch/+test.ha b/fnmatch/+test.ha @@ -0,0 +1,193 @@ +// TODO: remove this after the forward references fix +export type flags = enum uint { + PATHNAME = 1u << 0, + NOESCAPE = 1u << 1, + PERIOD = 1u << 2, +}; + +type testcase = (str, str, bool, []flags); + +const testcases: [_]testcase = [ + // homegrown tests + ("a", "a", true, []), + ("b", "b", true, []), + ("ε", "ε", true, []), + ("\0", "\0", true, []), + ("abcde", "abcde", true, []), + ("aaa", "bbb", false, []), + ("わたし", "わたし", true, []), + ("わした", "わたし", false, []), + ("わaし", "わたし", false, []), + + ("ab*cde", "abcde", true, []), + ("g*a", "gordana", true, []), + ("ab*cde*foo*bar", "abcdefooba", false, []), + ("*", "foo", true, []), + ("aa*", "aafoo", true, []), + ("bb*", "foo", false, []), + ("*cc", "foocc", true, []), + ("*dd", "foo", false, []), + ("ra**ra", "rarara", true, []), + ("x*yy*x", "xxyyyyyxxx", true, []), + ("*", "*", true, []), + ("*", "", true, []), + ("****", "a", true, []), + ("**a**", "a", true, []), + ("****", "", true, []), + ("*", "わたし", true, []), + + ("?", "ε", true, []), + ("?", "\0", true, []), + ("?", "*", true, []), + ("?", "", false, []), + ("??", "a", false, []), + ("??", "abc", false, []), + ("?aa", "bbb", false, []), + ("わ?し", "わたし", true, []), + ("???", "わたし", true, []), + + ("**?**", "", false, []), + ("*?*?*?*?", "abcd", true, []), + ("*?*?*?*?", "abc", false, []), + + ("[b]", "b", true, []), + ("a[b]c", "abc", true, []), + ("a[b]c", "axc", false, []), + ("[a-c]", "b", true, []), + ("[a-z]", "a", true, []), + ("[c-a]", "b", false, []), + ("x[a-c]y", "xay", true, []), + ("x[a-c]y", "xby", true, []), + ("x[a-c]y", "xcy", true, []), + ("x[a-c]y", "xzy", false, []), + ("x[a-c]y", "xy", false, []), + ("[a-c]*[a-c]", "axxxb", true, []), + ("わ[た]し", "わたし", true, []), + + ("[-]", "-", true, []), + ("[.]", ".", true, []), + ("[:ias]", ":", true, []), + ("[-]", "a", false, []), + ("[-ac]", "a", true, []), + ("[-ac]", "-", true, []), + ("[-ac]", "b", false, []), + ("[ac-]", "a", true, []), + ("[ac-]", "-", true, []), + ("[ac-]", "b", false, []), + + ("[.a.]", "a", false, []), + + ("[[:alnum:]]", "7", true, []), + ("[[alpha:]]", "a]", true, []), + ("[[alpha:]]", ":]", true, []), + ("[[:alpha:]]", "a", true, []), + ("[[:blank:]]", " ", true, []), + ("[[:alnum:]]a", "a]a", false, []), + ("[[:alnum:][[:digit:]]", "a", true, []), + + ("[!b]", "b", false, []), + ("a[!b]c", "abc", false, []), + ("a[!b]c", "axc", true, []), + ("[!a-c]", "b", false, []), + ("[!c-a]", "b", true, []), + ("x[!a-c]y", "xay", false, []), + ("x[!a-c]y", "xby", false, []), + ("x[!a-c]y", "xcy", false, []), + ("x[!a-c]y", "xzy", true, []), + ("x[!a-c]y", "xy", false, []), + ("[!a-c]*[!a-c]", "axxxb", false, []), + ("わ[!た]し", "わたし", false, []), + + ("[!-]", "-", false, []), + ("[!-]", "a", true, []), + ("[!-ac]", "a", false, []), + ("[!-ac]", "-", false, []), + ("[!-ac]", "b", true, []), + + ("[![:alnum:]]", "a", false, []), + ("[![:alnum:]]a", "a]a", false, []), + ("[![:alnum:][:digit:]]", "a", false, []), + + (".", ".", true, [flags::PERIOD]), + ("*", ".", false, [flags::PERIOD]), + ("?", ".", false, [flags::PERIOD]), + ("[.]", ".", false, [flags::PERIOD]), + (".*", ".asdf", true, [flags::PERIOD]), + (".*", "asdf", false, [flags::PERIOD]), + + ("\\", "\\", true, [flags::NOESCAPE]), + ("\\*", "\\asdf", true, [flags::NOESCAPE]), + + // adapted from musl tests + ("*.c", "foo.c", true, []), + ("*.c", ".c", true, []), + ("*.a", "foo.c", false, []), + ("*.c", ".foo.c", true, []), + ("a\\*.c", "ax.c", false, []), + ("a[xy].c", "ax.c", true, []), + ("a[!y].c", "ax.c", true, []), + ("-O[01]", "-O1", true, []), + ("[[?*\\]", "\\", true, []), + ("[]?*\\]", "]", true, []), + ("[!]a-]", "b", true, []), + ("[]-_]", "^", true, []), + ("[!]-_]", "X", true, []), + ("??", "-", false, []), + ("[*]/b", "a/b", false, []), + ("[*]/b", "*/b", true, []), + ("[?]/b", "a/b", false, []), + ("[?]/b", "?/b", true, []), + ("[[a]/b", "a/b", true, []), + ("[[a]/b", "[/b", true, []), + ("\\*/b", "a/b", false, []), + ("\\*/b", "*/b", true, []), + ("\\?/b", "a/b", false, []), + ("\\?/b", "?/b", true, []), + ("[/b", "[/b", false, []), + ("\\[/b", "[/b", true, []), + ("??""/b", "aa/b", true, []), + ("???b", "aa/b", true, []), + ("a[/]b", "a/b", true, []), + ("*[/]b", "a", false, []), + ("[![:d-d]", "b", false, []), + ("[[:d-d]", "[", false, []), + ("[![:d-d]", "[", false, []), + ("[a-z]/[a-z]", "a/b", true, [flags::PATHNAME]), + ("a[/]b", "a/b", false, [flags::PATHNAME]), + ("*", "a/b", false, [flags::PATHNAME]), + ("*[/]b", "a/b", false, [flags::PATHNAME]), + ("*[b]", "a/b", false, [flags::PATHNAME]), + ("a[a/z]*.c", "a/x.c", false, [flags::PATHNAME]), + ("a/*.c", "a/x.c", true, [flags::PATHNAME]), + ("a*.c", "a/x.c", false, [flags::PATHNAME]), + ("*/foo", "/foo", true, [flags::PATHNAME]), + ("*.c", ".foo.c", false, [flags::PERIOD]), + ("*.c", "foo.c", true, [flags::PERIOD]), + ("a\\*.c", "a*.c", false, [flags::NOESCAPE]), + ("???b", "aa/b", false, [flags::PATHNAME]), + ("?a/b", ".a/b", false, [flags::PATHNAME, flags::PERIOD]), + ("a/?b", "a/.b", false, [flags::PATHNAME, flags::PERIOD]), + ("*a/b", ".a/b", false, [flags::PATHNAME, flags::PERIOD]), + ("a/*b", "a/.b", false, [flags::PATHNAME, flags::PERIOD]), + ("[.]a/b", ".a/b", false, [flags::PATHNAME, flags::PERIOD]), + ("a/[.]b", "a/.b", false, [flags::PATHNAME, flags::PERIOD]), + ("*/?", "a/b", true, [flags::PATHNAME, flags::PERIOD]), + ("?/*", "a/b", true, [flags::PATHNAME, flags::PERIOD]), + (".*/?", ".a/b", true, [flags::PATHNAME, flags::PERIOD]), + ("*/.?", "a/.b", true, [flags::PATHNAME, flags::PERIOD]), + ("*/*", "a/.b", false, [flags::PATHNAME, flags::PERIOD]), + ("*?*/*", "a/.b", true, [flags::PERIOD]), + ("*[.]/b", "a./b", true, [flags::PATHNAME, flags::PERIOD]), + ("*[[:alpha:]]/*[[:alnum:]]", "a/b", true, [flags::PATHNAME]), + ("a?b", "a.b", true, [flags::PATHNAME, flags::PERIOD]), + ("a*b", "a.b", true, [flags::PATHNAME, flags::PERIOD]), + ("a[.]b", "a.b", true, [flags::PATHNAME, flags::PERIOD]), +]; + +@test fn fnmatch() void = { + for (let i = 0z; i < len(testcases); i += 1) { + let tc = testcases[i]; + assert(fnmatch(tc.0, tc.1, tc.3...) == tc.2); + }; + +}; diff --git a/fnmatch/fnmatch.ha b/fnmatch/fnmatch.ha @@ -0,0 +1,343 @@ +use ascii; +use errors; +use strings; +use sort; + +// A set of flags that alter the matching behavior of [[fnmatch::fnmatch]] +export type flags = enum uint { + // If this flag is set, slashes in the string will only be matched by + // literal slashes in the pattern + PATHNAME = 1u << 0, + // If this flag is set, backslash will be treated as an ordinary + // character + NOESCAPE = 1u << 1, + // If this flag is set, a '.' at the beginning of the string can only + // be matched by a literal '.' in the pattern. If + // [[fnmatch::flags::PATHNAME]] is set simultaneously, this behavior also + // apply to any periods immediately following a slash. + PERIOD = 1u << 2, +}; + +type bracket = void; +type star = void; +type question = void; +type end = void; +type token = (rune | bracket | star | question | end); + +// Check whether the [[string]] matches the [[pattern]] which is a shell +// wildcard pattern with the following matching rules: +// - '?' matches any single character +// - '*' matches any string, including the empty string +// - '[' and ']' enclose a bracket expression. Matching rules for bracket +// expressions are identical to those of bracket subexpressions in regular +// expressions, except that '!' takes the role of '^' when placed right after +// the opening '['. +// - '\' escapes the following character, e. g. "\*" only matches literal '*' +// and has no special meaning +// - all other characters only match themselves +// +// A set of flags that alter the matching behavior may be passed to +// [[fnmatch::fnmatch]]. For an explanation of their meaning, see [ +// [fnmatch::flags]]. +export fn fnmatch(pattern: str, string: str, flag: flags...) bool = { + let fl: flags = 0; + for (let i = 0z; i < len(flag); i += 1) { + fl |= flag[i]; + }; + if (fl & flags::PATHNAME != 0) { + return match (fnmatch_pathname(pattern, string, fl)) { + b: bool => b, + * => false, + }; + } else { + return match (fnmatch_internal(pattern, string, fl)) { + b: bool => b, + * => false, + }; + }; +}; + +// Split the pattern and the string on every '/' and process each part +// separately +fn fnmatch_pathname( + pattern: str, + string: str, + fl: flags, +) (bool | errors::unsupported | errors::invalid) = { + let tok = strings::tokenize(string, "/"); + let p_iter = strings::iter(pattern); + let start = p_iter; + for (true) :outer { + start = p_iter; + for (true) match (pat_next(&p_iter, fl)?) { + end => break :outer, + r: rune => if (r == '/') break, + bracket => match_bracket(&p_iter, '\0')?, + (question | star) => void, + }; + let s = match (strings::next_token(&tok)) { + void => return false, + s: str => s, + }; + strings::prev(&p_iter); + let p = cut_tail(strings::iter_str(&start), &p_iter); + strings::next(&p_iter); + if (!fnmatch_internal(p, s, fl)?) { + return false; + }; + }; + let s = match(strings::next_token(&tok)) { + void => return false, + s: str => s, + }; + let p = strings::iter_str(&start); + return fnmatch_internal(p, s, fl)? && strings::next_token(&tok) is void; +}; + +// Core fnmatch function, implementing the "Sea of stars" algorithm that is also +// used in Musl libc. First we make sure the parts before the first star and +// after the last star produce exact matches and then proceed to greedily match +// everything in between. Because of the greedy property this algorithm does not +// have exponential corner cases. +export fn fnmatch_internal( + pattern: str, + string: str, + fl: flags, +) (bool | errors::invalid | errors::unsupported) = { + if (fl & flags::PERIOD != 0) { + if (strings::has_prefix(string, ".") + && !strings::has_prefix(pattern, ".")) { + return false; + }; + }; + + let p = strings::iter(pattern); + let s = strings::iter(string); + + // match up to the first * + for (true) { + let copy = s; + let rn = strings::next(&copy); + let t = match (pat_next(&p, fl)?) { + star => break, + end => return rn is void, + question => rn is rune, + bracket => rn is rune && match_bracket(&p, rn: rune)?, + r: rune => rn is rune && rn: rune == r, + }; + if (!t) { + return false; + }; + s = copy; + }; + + // find the tail of the pattern + let p_copy = p, p_last = (p, 0z); + let cnt = 0z; + for (true; cnt += 1) { + match (pat_next(&p, fl)?) { + end => break, + star => p_last = (p, cnt + 1), + bracket => match_bracket(&p, '\0')?, + (question | rune) => void, + }; + }; + p = p_last.0; + cnt = cnt - p_last.1; + let s_copy = s; + s = strings::riter(string); + for (let i = 0z; i < cnt; i += 1) { + strings::prev(&s); + }; + + // match the tail + let s_last = s; + for (true) { + let rn = strings::next(&s); + let matches = match (pat_next(&p, fl)?) { + end => if (rn is void) break else return false, + question => rn is rune, + bracket => rn is rune && match_bracket(&p, rn: rune)?, + r: rune => rn is rune && rn: rune == r, + star => abort(), + }; + if (!matches) { + return false; + }; + }; + + // match the "sea of stars" in the middle + s_copy = strings::iter(cut_tail(strings::iter_str(&s_copy), &s_last)); + p_copy = strings::iter(cut_tail(strings::iter_str(&p_copy), &p_last.0)); + for (true) :outer { + p = p_copy; + if (len(strings::iter_str(&p)) == 0) { + return true; + }; + s = s_copy; + for (true) :inner { + let copy = s; + let rn = strings::next(&copy); + let matched = match (pat_next(&p, fl)?) { + end => abort(), + question => rn is rune, + bracket => rn is rune && match_bracket(&p, rn: rune)?, + r: rune => rn is rune && r == rn: rune, + star => { + p_copy = p; + s_copy = s; + continue :outer; + }, + }; + if (!matched) { + break :inner; + }; + s = copy; + }; + match (strings::next(&s_copy)) { + void => return false, + rune => void, + }; + }; + abort(); +}; + +fn match_bracket( + it: *strings::iterator, + c: rune, +) (bool | errors::invalid | errors::unsupported) = { + let old = *it; + let first = advance_or_err(it)?; + let inv = false; + if (first == '^') { + return errors::invalid; + }; + if (first == '!') { + inv = true; + first = advance_or_err(it)?; + }; + let found = (first == c); + let last: (rune | void) = first; + if (first == ']') { + first = advance_or_err(it)?; + }; + for (let r = first; true; r = advance_or_err(it)?) { + switch (r) { + ']' => break, + '-' => { + let end = advance_or_err(it)?; + if (end == ']') { + // '-' at the end matches itself + strings::push(it, ']'); + last = '-'; + found ||= (c == '-'); + continue; + }; + if (last is void) { + return errors::invalid; + }; + let l = last: rune; + found ||= (l: u32 <= c: u32 && c: u32 <= end: u32); + last = void; // forbid 'a-f-n' + }, + '[' => { + let next_rune = advance_or_err(it)?; + switch (next_rune) { // TODO localization + '=', '.' => return errors::unsupported, + ':' => { + let t = match_ctype(it, c)?; + found ||= t; + }, + * => { + strings::push(it, next_rune); + found ||= (c == '['); + }, + }; + last = '['; + }, + * => { + found ||= (c == r); + last = r; + }, + }; + }; + + let cnt = len(strings::iter_str(&old)) - len(strings::iter_str(it)); + if (last is rune && first == last: rune && cnt >= 4) { + switch (first) { + '=', '.', ':' => return errors::invalid, + * => void, + }; + }; + return found ^^ inv; +}; + +fn match_ctype(it: *strings::iterator, c: rune) (bool | errors::invalid) = { + let s = strings::iter_str(it); + let i = 0z; + for (let r = '\0'; r != ':'; i+= 1) { + r = advance_or_err(it)?; + if (!ascii::isascii(r)) { + return errors::invalid; + }; + }; + if (advance_or_err(it)? != ']') { + return errors::invalid; + }; + let name = strings::sub(s, 0, i - 1); + return match (ctype_name_to_func(name)) { + null => return errors::invalid, + f: *fn(c: rune) bool => f(c), + }; +}; + +type funcmap = (str, *fn(c: rune) bool); + +fn cmp(a: const *void, b: const *void) int = { + return ascii::strcmp((a: *funcmap).0, *(b: const *str)): int; +}; + +fn ctype_name_to_func(name: str) nullable *fn(c: rune) bool = { + const map: [_]funcmap = [ + ("alnum", &ascii::isalnum), ("alpha", &ascii::isalpha), + ("blank", &ascii::isblank), ("cntrl", &ascii::iscntrl), + ("digit", &ascii::isdigit), ("graph", &ascii::isgraph), + ("lower", &ascii::islower), ("print", &ascii::isprint), + ("punct", &ascii::ispunct), ("space", &ascii::isspace), + ("upper", &ascii::isupper), ("xdigit",&ascii::isxdigit), + ]; + return match (sort::search(map, size(funcmap), &name, &cmp)) { + null => null: nullable *fn(c: rune) bool, + p: *void => (p: *funcmap).1, + }; +}; + +fn pat_next(pat: *strings::iterator, fl: flags) (token | errors::invalid) = { + let r = match (strings::next(pat)) { + void => return end, + r: rune => r, + }; + return switch (r) { + '*' => star, + '?' => question, + '[' => bracket, + // TODO: remove ? (harec bug workaround) + '\\' => if (fl & flags::NOESCAPE == 0) advance_or_err(pat)? + else '\\': token, // TODO: remove cast (harec bug workaround) + * => r, + }; +}; + +fn advance_or_err(it: *strings::iterator) (rune | errors::invalid) = { + return match (strings::next(it)) { + r: rune => r, + void => errors::invalid, + }; +}; + +fn cut_tail(s: str, it: *strings::iterator) str = { + let s_len = len(s), t_len = len(strings::iter_str(it)); + let b = strings::toutf8(s); + return strings::fromutf8(b[..s_len - t_len]); +}; + diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib @@ -290,6 +290,16 @@ fmt() { gen_ssa fmt bufio io os strconv strings types } +fnmatch() { + if [ $testing -eq 0 ] + then + gen_srcs fnmatch fnmatch.ha + else + gen_srcs fnmatch fnmatch.ha +test.ha + fi + gen_ssa fnmatch strings bytes sort ascii io fmt +} + format_elf() { gen_srcs format::elf \ '$(ARCH).ha' \ @@ -868,6 +878,7 @@ encoding::utf8 endian errors fmt +fnmatch format::elf format::xml fs diff --git a/stdlib.mk b/stdlib.mk @@ -162,6 +162,10 @@ hare_stdlib_deps+=$(stdlib_format_elf) stdlib_format_xml=$(HARECACHE)/format/xml/format_xml.o hare_stdlib_deps+=$(stdlib_format_xml) +# gen_lib fnmatch +stdlib_fnmatch=$(HARECACHE)/fnmatch/fnmatch.o +hare_stdlib_deps+=$(stdlib_fnmatch) + # gen_lib fs stdlib_fs=$(HARECACHE)/fs/fs.o hare_stdlib_deps+=$(stdlib_fs) @@ -591,6 +595,16 @@ $(HARECACHE)/format/xml/format_xml.ssa: $(stdlib_format_xml_srcs) $(stdlib_rt) $ @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nformat::xml \ -t$(HARECACHE)/format/xml/format_xml.td $(stdlib_format_xml_srcs) +# fnmatch +stdlib_fnmatch_srcs= \ + $(STDLIB)/fnmatch/fnmatch.ha + +$(HARECACHE)/fnmatch/fnmatch.ssa: $(stdlib_fnmatch_srcs) $(stdlib_rt) $(stdlib_strings) $(stdlib_bytes) $(stdlib_sort) $(stdlib_ascii) $(stdlib_io) $(stdlib_fmt) + @printf 'HAREC \t$@\n' + @mkdir -p $(HARECACHE)/fnmatch + @HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nfnmatch \ + -t$(HARECACHE)/fnmatch/fnmatch.td $(stdlib_fnmatch_srcs) + # fs stdlib_fs_srcs= \ $(STDLIB)/fs/types.ha \ @@ -1375,6 +1389,10 @@ hare_testlib_deps+=$(testlib_format_elf) testlib_format_xml=$(TESTCACHE)/format/xml/format_xml.o hare_testlib_deps+=$(testlib_format_xml) +# gen_lib fnmatch +testlib_fnmatch=$(TESTCACHE)/fnmatch/fnmatch.o +hare_testlib_deps+=$(testlib_fnmatch) + # gen_lib fs testlib_fs=$(TESTCACHE)/fs/fs.o hare_testlib_deps+=$(testlib_fs) @@ -1812,6 +1830,17 @@ $(TESTCACHE)/format/xml/format_xml.ssa: $(testlib_format_xml_srcs) $(testlib_rt) @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nformat::xml \ -t$(TESTCACHE)/format/xml/format_xml.td $(testlib_format_xml_srcs) +# fnmatch +testlib_fnmatch_srcs= \ + $(STDLIB)/fnmatch/fnmatch.ha \ + $(STDLIB)/fnmatch/+test.ha + +$(TESTCACHE)/fnmatch/fnmatch.ssa: $(testlib_fnmatch_srcs) $(testlib_rt) $(testlib_strings) $(testlib_bytes) $(testlib_sort) $(testlib_ascii) $(testlib_io) $(testlib_fmt) + @printf 'HAREC \t$@\n' + @mkdir -p $(TESTCACHE)/fnmatch + @HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nfnmatch \ + -t$(TESTCACHE)/fnmatch/fnmatch.td $(testlib_fnmatch_srcs) + # fs testlib_fs_srcs= \ $(STDLIB)/fs/types.ha \