hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 90f29aff6a24db536b39fcd4b1796eb336782596
parent 36c9c7c1dbd41dcdcf9448ea45d39339856acf5b
Author: Autumn! <autumnull@posteo.net>
Date:   Sun, 12 Feb 2023 21:09:47 +0000

strings: Add rindex, rbyteindex, rtokenize, rsplitn, rcut

Signed-off-by: Autumn! <autumnull@posteo.net>

Diffstat:
Mstrings/index.ha | 78++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mstrings/tokenize.ha | 82++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 155 insertions(+), 5 deletions(-)

diff --git a/strings/index.ha b/strings/index.ha
@@ -17,6 +17,18 @@ export fn index(haystack: str, needle: (str | rune)) (size | void) = {
 	};
 };
 
+// Returns the index of the last occurrence of 'needle' in the 'haystack', or
+// void if not present. The index returned is the rune-wise index, not the
+// byte-wise index.
+export fn rindex(haystack: str, needle: (str | rune)) (size | void) = {
+	match (needle) {
+	case let r: rune =>
+		return rindex_rune(haystack, r);
+	case let s: str =>
+		return rindex_string(haystack, s);
+	};
+};
+
 fn index_rune(s: str, r: rune) (size | void) = {
 	let iter = iter(s);
 	for (let i = 0z; true; i += 1) {
@@ -31,6 +43,24 @@ fn index_rune(s: str, r: rune) (size | void) = {
 	};
 };
 
+fn rindex_rune(s: str, r: rune) (size | void) = {
+	let iter = riter(s);
+	for (true) {
+		match (next(&iter)) {
+		case let n: rune =>
+			if (r == n) {
+				// The runes the reverse iterator has not yet
+				// yielded are exactly those before the match.
+				let i = 0z;
+				for (next(&iter) is rune) i += 1;
+				return i;
+			};
+		case void =>
+			break;
+		};
+	};
+};
+
 fn index_string(s: str, needle: str) (size | void) = {
 	let s_iter = iter(s);
 	for (let i = 0z; true; i += 1) {
@@ -55,6 +85,35 @@ fn index_string(s: str, needle: str) (size | void) = {
 	};
 };
 
+fn rindex_string(s: str, needle: str) (size | void) = {
+	let s_iter = riter(s);
+	for (true) {
+		let rest_iter = s_iter;
+		let needle_iter = riter(needle);
+		for (true) {
+			const rest_rune = next(&rest_iter);
+			const needle_rune = next(&needle_iter);
+			if (rest_rune is void && !(needle_rune is void)) {
+				break;
+			};
+			if (needle_rune is void) {
+				// rest_iter has stepped one rune past the start
+				// of the match: count that rune (if there was
+				// one) plus everything still unread.
+				let i = if (rest_rune is rune) 1z else 0z;
+				for (next(&rest_iter) is rune) i += 1;
+				return i;
+			};
+			if ((rest_rune as rune) != (needle_rune as rune)) {
+				break;
+			};
+		};
+		if (next(&s_iter) is void) {
+			break;
+		};
+	};
+};
+
 @test fn index() void = {
 	assert(index("hello world", 'w') as size == 6);
 	assert(index("こんにちは", 'ち') as size == 3);
@@ -67,6 +126,11 @@ fn index_string(s: str, needle: str) (size | void) = {
 	assert(index("hello world!", "word") is void);
 	assert(index("こんにちは", "ちは") as size == 3);
 	assert(index("こんにちは", "きょうは") is void);
+
+	assert(index("hello world!", "o") as size == 4);
+	assert(rindex("hello world!", "o") as size == 7);
+	assert(rindex("こんにちは", 'ち') as size == 3);
+	assert(rindex("こんにちは", "ちは") as size == 3);
 };
 
 // Returns the byte-wise index of the first occurance of 'needle' in the
@@ -80,6 +144,17 @@ export fn byteindex(haystack: str, needle: (str | rune)) (size | void) = {
 	});
 };
 
+// Returns the byte-wise index of the last occurrence of 'needle' in the
+// 'haystack', or void if not present.
+export fn rbyteindex(haystack: str, needle: (str | rune)) (size | void) = {
+	return bytes::rindex(toutf8(haystack), match (needle) {
+	case let s: str =>
+		yield toutf8(s);
+	case let r: rune =>
+		yield if (r: u32 <= 0x7f) r: u32: u8 else utf8::encoderune(r);
+	});
+};
+
 @test fn byteindex() void = {
 	assert(byteindex("hello world", 'w') as size == 6);
 	assert(byteindex("こんにちは", 'ち') as size == 9);
@@ -92,4 +167,7 @@ export fn byteindex(haystack: str, needle: (str | rune)) (size | void) = {
 	assert(byteindex("hello world!", "word") is void);
 	assert(byteindex("こんにちは", "ちは") as size == 9);
 	assert(byteindex("こんにちは", "きょうは") is void);
+
+	assert(byteindex("またあったね", "た") as size == 3);
+	assert(rbyteindex("またあったね", "た") as size == 12);
 };
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -9,7 +9,8 @@ use types;
 // The state for a tokenizer.
 export type tokenizer = bytes::tokenizer;
 
-// Returns a tokenizer which yields sub-strings tokenized by a delimiter.
+// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
+// starting at the beginning of the string.
 //
 // 	let tok = strings::tokenize("hello, my name is drew", " ");
 // 	assert(strings::next_token(&tok) as str == "hello,");
@@ -21,6 +22,20 @@ export type tokenizer = bytes::tokenizer;
 export fn tokenize(s: str, delim: str) tokenizer =
 	bytes::tokenize(toutf8(s), toutf8(delim));
 
+// Returns a tokenizer which yields sub-strings tokenized by a delimiter,
+// starting at the end of the string and moving backwards with each call
+// to [[next_token]].
+//
+// 	let tok = strings::rtokenize("hello, my name is drew", " ");
+// 	assert(strings::next_token(&tok) as str == "drew");
+// 	assert(strings::next_token(&tok) as str == "is");
+// 	assert(strings::next_token(&tok) as str == "name");
+// 	assert(strings::remaining_tokens(&tok) == "hello, my");
+//
+// The caller must ensure that 'delimiter' is not an empty string.
+export fn rtokenize(s: str, delim: str) tokenizer =
+	bytes::rtokenize(toutf8(s), toutf8(delim));
+
 // Returns the next string from a tokenizer, and advances the cursor. Returns
 // void if there are no tokens left.
 export fn next_token(s: *tokenizer) (str | void) = {
@@ -67,12 +82,20 @@ export fn remaining_tokens(s: *tokenizer) str = {
 	let tok = tokenize("", "foo");
 	assert(peek_token(&tok) is void);
 	assert(next_token(&tok) is void);
+
+	let tok = rtokenize("Hello, my name is drew", " ");
+	assert(next_token(&tok) as str == "drew");
+	assert(next_token(&tok) as str == "is");
+	assert(next_token(&tok) as str == "name");
+	assert(remaining_tokens(&tok) == "Hello, my");
+	assert(peek_token(&tok) as str == "my");
+	assert(remaining_tokens(&tok) == "Hello, my");
 };
 
-// Splits a string into tokens delimited by 'delim', returning a slice of up to
-// N tokens. The caller must free this slice. The strings within the slice are
-// borrowed from 'in', and needn't be freed - but should be [[dupall]]'d if they
-// should outlive 'in'.
+// Splits a string into tokens delimited by 'delim', starting at the beginning
+// of the string, and returning a slice of up to N tokens. The caller must free
+// this slice. The strings within the slice are borrowed from 'in', and needn't
+// be freed - but should be [[dupall]]'d if they should outlive 'in'.
 //
 // The caller must ensure that 'delimiter' is not an empty string.
 export fn splitn(in: str, delim: str, n: size) []str = {
@@ -94,6 +117,31 @@ export fn splitn(in: str, delim: str, n: size) []str = {
 	return toks;
 };
 
+// Splits a string into tokens delimited by 'delim', starting at the end
+// of the string, and returning a slice of up to N tokens. The caller must free
+// this slice. The strings within the slice are borrowed from 'in', and needn't
+// be freed - but should be [[dupall]]'d if they should outlive 'in'.
+//
+// The caller must ensure that 'delimiter' is not an empty string.
+export fn rsplitn(in: str, delim: str, n: size) []str = {
+	let toks: []str = [];
+	let tok = rtokenize(in, delim);
+	for (let i = 0z; i < n - 1z; i += 1) {
+		match (next_token(&tok)) {
+		case let s: str =>
+			insert(toks[0], s);
+		case void =>
+			return toks;
+		};
+	};
+	match(peek_token(&tok)) {
+	case void => void;
+	case let s: str =>
+		insert(toks[0], remaining_tokens(&tok));
+	};
+	return toks;
+};
+
 // Splits a string into tokens delimited by 'delim'. The caller must free the
 // returned slice. The strings within the slice are borrowed from 'in', and
 // needn't be freed - but must be [[dupall]]'d if they should outlive 'in'.
@@ -122,6 +170,13 @@ export fn split(in: str, delim: str) []str = splitn(in, delim, types::SIZE_MAX);
 	for (let i = 0z; i < len(expected3); i += 1) {
 		assert(expected3[i] == actual3[i]);
 	};
+
+	const expected4 = ["Hello, my", "name", "is", "Drew"];
+	const actual4 = rsplitn("Hello, my name is Drew", " ", 4z);
+	assert(len(expected4) == len(actual4));
+	for (let i = 0z; i < len(expected4); i += 1) {
+		assert(expected4[i] == actual4[i]);
+	};
 };
 
 // Returns a string "cut" along the first instance of a delimiter, returning
@@ -138,6 +193,20 @@ export fn cut(in: str, delim: str) (str, str) = {
 	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
 };
 
+// Returns a string "cut" along the last instance of a delimiter, returning
+// everything up to the delimiter, and everything after the delimiter, in a
+// tuple.
+//
+// 	strings::rcut("hello=world=foobar", "=")	// ("hello=world", "foobar")
+// 	strings::rcut("hello world", "=")	// ("hello world", "")
+//
+// The return value is borrowed from the 'in' parameter. The caller must ensure
+// that 'delimiter' is not an empty string.
+export fn rcut(in: str, delim: str) (str, str) = {
+	let c = bytes::rcut(toutf8(in), toutf8(delim));
+	return (fromutf8_unsafe(c.0), fromutf8_unsafe(c.1));
+};
+
 @test fn cut() void = {
 	const sample = cut("hello=world", "=");
 	assert(sample.0 == "hello" && sample.1 == "world");
@@ -147,4 +216,7 @@ export fn cut(in: str, delim: str) (str, str) = {
 	assert(sample.0 == "hello world" && sample.1 == "");
 	const sample = cut("", "=");
 	assert(sample.0 == "" && sample.1 == "");
+
+	const sample = rcut("hello=world=foobar", "=");
+	assert(sample.0 == "hello=world" && sample.1 == "foobar");
 };