[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git

commit 36c9c7c1dbd41dcdcf9448ea45d39339856acf5b
parent 52b3f2d0c7a85e04a79666a954101e527b7f1272
Author: Autumn! <autumnull@posteo.net>
Date:   Sun, 12 Feb 2023 21:09:46 +0000

bytes: Add rtokenize and rcut

Signed-off-by: Autumn! <autumnull@posteo.net>

Diffstat:
M bytes/tokenize.ha | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 109 insertions(+), 18 deletions(-)

diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -5,13 +5,14 @@ use types;
 
 export type tokenizer = struct {
-	s: []u8,
-	d: []u8,
-	p: size,
+	s: []u8, // string being tokenized
+	d: []u8, // delimiter
+	p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones.
 };
 
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter. The
-// caller must ensure that 'delimiter' is not an empty slice.
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting
+// at the beginning of the slice. The caller must ensure that 'delimiter' is not
+// an empty slice. Can tokenize a slice of length less than [[types::I64_MAX]].
 export fn tokenize(s: []u8, delim: []u8) tokenizer = {
 	assert(len(delim) > 0, "bytes::tokenize called with empty slice");
 	if (len(s) == 0) {
@@ -20,7 +21,25 @@ export fn tokenize(s: []u8, delim: []u8) tokenizer = {
 	return tokenizer {
 		s = s,
 		d = delim,
-		p = types::SIZE_MAX,
+		p = types::I64_MAX, // I64_MAX means we haven't peeked the next token yet.
+	};
+};
+
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting at
+// the end of the slice and moving backwards with each call to [[next_token]]. The
+// caller must ensure that 'delimiter' is not an empty slice. Can tokenize a slice
+// of length less than [[types::I64_MAX]].
+export fn rtokenize(s: []u8, delim: []u8) tokenizer = {
+	assert(len(delim) > 0, "bytes::rtokenize called with empty slice");
+	if (len(s) == 0) {
+		delim = [];
+	};
+	return tokenizer {
+		s = s,
+		d = delim,
+		p = types::I64_MIN, // I64_MIN means we haven't peeked the next token yet.
+		// also note that p == -1 corresponds to an index of len(s),
+		// and p == -(len(s) + 1) corresponds to an index of 0.
 	};
 };
 
@@ -30,13 +49,24 @@ export fn tokenize(s: []u8, delim: []u8) tokenizer = {
 // beginning or end of the sequence, respectively.
 export fn next_token(s: *tokenizer) ([]u8 | void) = match (peek_token(s)) {
 case let b: []u8 =>
-	if (s.p == len(s.s)) {
-		s.d = s.d[..0];
-		s.s = s.s[..0];
+	if (s.p < 0) { // reverse
+		if (len(s.s): i64 + s.p + 1 == 0) {
+			s.d = s.d[..0];
+			s.s = s.s[..0];
+		} else {
+			const end = (len(s.s): i64 + s.p + 1): size - len(s.d);
+			s.s = s.s[..end];
+		};
+		s.p = types::I64_MIN;
 	} else {
-		s.s = s.s[s.p + len(s.d)..];
+		if (s.p == len(s.s): i64) {
+			s.d = s.d[..0];
+			s.s = s.s[..0];
+		} else {
+			s.s = s.s[s.p: size + len(s.d)..];
+		};
+		s.p = types::I64_MAX;
 	};
-	s.p = types::SIZE_MAX;
 	return b;
 case => void;
 };
@@ -46,15 +76,27 @@ export fn peek_token(s: *tokenizer) ([]u8 | void) = {
 	if (len(s.d) == 0) {
 		return;
 	};
-	if (s.p > len(s.s)) {
-		s.p = match (index(s.s, s.d)) {
-		case let i: size =>
-			yield i;
-		case void =>
-			yield len(s.s);
+	if (s.p < 0) { // reverse
+		if (s.p == types::I64_MIN) {
+			s.p = match (rindex(s.s, s.d)) {
+			case let i: size =>
+				yield (i + len(s.d)): i64 - len(s.s): i64 - 1;
+			case void =>
+				yield -(len(s.s): i64 + 1);
+			};
 		};
+		return s.s[len(s.s) + s.p: size + 1..];
+	} else {
+		if (s.p == types::I64_MAX) {
+			s.p = match (index(s.s, s.d)) {
+			case let i: size =>
+				yield i: i64;
+			case void =>
+				yield len(s.s): i64;
+			};
+		};
+		return s.s[..s.p: size];
 	};
-	return s.s[..s.p];
 };
@@ -116,6 +158,29 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 	t = tokenize([]: []u8, [42]);
 	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
+
+	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
+	let t = rtokenize(input, [24, 42]);
+	let p = peek_token(&t) as []u8;
+	let n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([4, 5], n));
+	let p = peek_token(&t) as []u8;
+	let n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([3, 24], n));
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
+	assert(equal([1, 2], next_token(&t) as []u8));
+	assert(peek_token(&t) is void);
+	assert(next_token(&t) is void);
+
+	const input: [_]u8 = [1, 2, 3, 24, 42, 4, 24, 42];
+	t = rtokenize(input, [24, 42]);
+	assert(equal([], next_token(&t) as []u8));
+	assert(equal([4], next_token(&t) as []u8));
+	assert(equal(remaining_tokens(&t), [1, 2, 3]));
+	assert(equal(peek_token(&t) as []u8, [1, 2, 3]));
+	assert(equal(remaining_tokens(&t), [1, 2, 3]));
 };
 
 // Returns the input slice "cut" along the first instance of a delimiter,
@@ -139,6 +204,27 @@ export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
 	};
 };
 
+// Returns the input slice "cut" along the last instance of a delimiter,
+// returning everything up to the delimiter, and everything after the delimiter,
+// in a tuple. The contents are borrowed from the input slice.
+//
+// The caller must ensure that 'delimiter' is not an empty slice.
+export fn rcut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
+	let ln = if (delim is u8) {
+		yield 1z;
+	} else {
+		let ln = len(delim: []u8);
+		assert(ln > 0, "bytes::rcut called with empty delimiter");
+		yield ln;
+	};
+	match (rindex(in, delim)) {
+	case let i: size =>
+		return (in[..i], in[i + ln..]);
+	case void =>
+		return (in, []);
+	};
+};
+
 @test fn cut() void = {
 	const c = cut(['a', 'b', 'c'], ['b']);
 	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
@@ -150,4 +236,9 @@ export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
 	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, []));
 	const c = cut([], 'x');
 	assert(equal(c.0, []) && equal(c.1, []));
+
+	const c = rcut(['a', 'b', 'c'], ['b']);
+	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
+	const c = rcut(['a', 'b', 'c', 'b', 'a'], 'b');
+	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, ['a']));
 };
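
For readers who want to try the new API, here is a minimal sketch of driving the reverse tokenizer from outside the module. The input bytes are made up for illustration; only rtokenize, next_token, and equal come from bytes as patched above.

use bytes;

export fn main() void = {
	// Same data a forward tokenize() call would see, but tokens are
	// yielded right-to-left: "ef", then "cd", then "ab".
	let t = bytes::rtokenize(['a', 'b', ':', 'c', 'd', ':', 'e', 'f'], [':']);
	assert(bytes::equal(bytes::next_token(&t) as []u8, ['e', 'f']));
	assert(bytes::equal(bytes::next_token(&t) as []u8, ['c', 'd']));
	assert(bytes::equal(bytes::next_token(&t) as []u8, ['a', 'b']));
	assert(bytes::next_token(&t) is void); // input exhausted
};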
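A similar sketch contrasts rcut with cut: cut splits on the first occurrence of the delimiter, rcut on the last. Again the input is illustrative only, mirroring the style of the tests in the patch.

use bytes;

export fn main() void = {
	// cut() splits at the first 'b'; rcut() splits at the last one.
	const c = bytes::cut(['a', 'b', 'c', 'b', 'a'], 'b');
	assert(bytes::equal(c.0, ['a']) && bytes::equal(c.1, ['c', 'b', 'a']));
	const r = bytes::rcut(['a', 'b', 'c', 'b', 'a'], 'b');
	assert(bytes::equal(r.0, ['a', 'b', 'c']) && bytes::equal(r.1, ['a']));
};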