[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git

commit 36c9c7c1dbd41dcdcf9448ea45d39339856acf5b
parent 52b3f2d0c7a85e04a79666a954101e527b7f1272
Author: Autumn! <autumnull@posteo.net>
Date:   Sun, 12 Feb 2023 21:09:46 +0000

bytes: Add rtokenize and rcut

Signed-off-by: Autumn! <autumnull@posteo.net>

Diffstat:
M bytes/tokenize.ha | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 109 insertions(+), 18 deletions(-)

diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -5,13 +5,14 @@ use types;
 
 export type tokenizer = struct {
-	s: []u8,
-	d: []u8,
-	p: size,
+	s: []u8, // string being tokenized
+	d: []u8, // delimiter
+	p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones.
 };
 
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter. The
-// caller must ensure that 'delimiter' is not an empty slice.
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting
+// at the beginning of the slice. The caller must ensure that 'delimiter' is not
+// an empty slice. Can tokenize a slice of length less than [[types::I64_MAX]].
 export fn tokenize(s: []u8, delim: []u8) tokenizer = {
 	assert(len(delim) > 0, "bytes::tokenize called with empty slice");
 	if (len(s) == 0) {
@@ -20,7 +21,25 @@ export fn tokenize(s: []u8, delim: []u8) tokenizer = {
 	return tokenizer {
 		s = s,
 		d = delim,
-		p = types::SIZE_MAX,
+		p = types::I64_MAX, // I64_MAX means we haven't peeked the next token yet.
+	};
+};
+
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting at
+// the end of the slice and moving backwards with each call to [[next_token]]. The
+// caller must ensure that 'delimiter' is not an empty slice. Can tokenize a slice
+// of length less than [[types::I64_MAX]].
+export fn rtokenize(s: []u8, delim: []u8) tokenizer = {
+	assert(len(delim) > 0, "bytes::rtokenize called with empty slice");
+	if (len(s) == 0) {
+		delim = [];
+	};
+	return tokenizer {
+		s = s,
+		d = delim,
+		p = types::I64_MIN, // I64_MIN means we haven't peeked the next token yet.
+		// also note that p == -1 corresponds to an index of len(s),
+		// and p == -(len(s) + 1) corresponds to an index of 0.
 	};
 };
 
@@ -30,13 +49,24 @@ export fn tokenize(s: []u8, delim: []u8) tokenizer = {
 // beginning or end of the sequence, respectively.
 export fn next_token(s: *tokenizer) ([]u8 | void) = match (peek_token(s)) {
 case let b: []u8 =>
-	if (s.p == len(s.s)) {
-		s.d = s.d[..0];
-		s.s = s.s[..0];
+	if (s.p < 0) { // reverse
+		if (len(s.s): i64 + s.p + 1 == 0) {
+			s.d = s.d[..0];
+			s.s = s.s[..0];
+		} else {
+			const end = (len(s.s): i64 + s.p + 1): size - len(s.d);
+			s.s = s.s[..end];
+		};
+		s.p = types::I64_MIN;
 	} else {
-		s.s = s.s[s.p + len(s.d)..];
+		if (s.p == len(s.s): i64) {
+			s.d = s.d[..0];
+			s.s = s.s[..0];
+		} else {
+			s.s = s.s[s.p: size + len(s.d)..];
+		};
+		s.p = types::I64_MAX;
 	};
-	s.p = types::SIZE_MAX;
 	return b;
 case => void;
 };
@@ -46,15 +76,27 @@ export fn peek_token(s: *tokenizer) ([]u8 | void) = {
 	if (len(s.d) == 0) {
 		return;
 	};
-	if (s.p > len(s.s)) {
-		s.p = match (index(s.s, s.d)) {
-		case let i: size =>
-			yield i;
-		case void =>
-			yield len(s.s);
+	if (s.p < 0) { // reverse
+		if (s.p == types::I64_MIN) {
+			s.p = match (rindex(s.s, s.d)) {
+			case let i: size =>
+				yield (i + len(s.d)): i64 - len(s.s): i64 - 1;
+			case void =>
+				yield -(len(s.s): i64 + 1);
+			};
 		};
+		return s.s[len(s.s) + s.p: size + 1..];
+	} else {
+		if (s.p == types::I64_MAX) {
+			s.p = match (index(s.s, s.d)) {
+			case let i: size =>
+				yield i: i64;
+			case void =>
+				yield len(s.s): i64;
+			};
+		};
+		return s.s[..s.p: size];
 	};
-	return s.s[..s.p];
 };
@@ -116,6 +158,29 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 	t = tokenize([]: []u8, [42]);
 	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
+
+	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
+	let t = rtokenize(input, [24, 42]);
+	let p = peek_token(&t) as []u8;
+	let n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([4, 5], n));
+	let p = peek_token(&t) as []u8;
+	let n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([3, 24], n));
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
+	assert(equal([1, 2], next_token(&t) as []u8));
+	assert(peek_token(&t) is void);
+	assert(next_token(&t) is void);
+
+	const input: [_]u8 = [1, 2, 3, 24, 42, 4, 24, 42];
+	t = rtokenize(input, [24, 42]);
+	assert(equal([], next_token(&t) as []u8));
+	assert(equal([4], next_token(&t) as []u8));
+	assert(equal(remaining_tokens(&t), [1, 2, 3]));
+	assert(equal(peek_token(&t) as []u8, [1, 2, 3]));
+	assert(equal(remaining_tokens(&t), [1, 2, 3]));
 };
 
 // Returns the input slice "cut" along the first instance of a delimiter,
@@ -139,6 +204,27 @@ export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
 	};
 };
 
+// Returns the input slice "cut" along the last instance of a delimiter,
+// returning everything up to the delimiter, and everything after the delimiter,
+// in a tuple. The contents are borrowed from the input slice.
+//
+// The caller must ensure that 'delimiter' is not an empty slice.
+export fn rcut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
+	let ln = if (delim is u8) {
+		yield 1z;
+	} else {
+		let ln = len(delim: []u8);
+		assert(ln > 0, "bytes::rcut called with empty delimiter");
+		yield ln;
+	};
+	match (rindex(in, delim)) {
+	case let i: size =>
+		return (in[..i], in[i + ln..]);
+	case void =>
+		return (in, []);
+	};
+};
+
 @test fn cut() void = {
 	const c = cut(['a', 'b', 'c'], ['b']);
 	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
@@ -150,4 +236,9 @@ export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
 	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, []));
 	const c = cut([], 'x');
 	assert(equal(c.0, []) && equal(c.1, []));
+
+	const c = rcut(['a', 'b', 'c'], ['b']);
+	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
+	const c = rcut(['a', 'b', 'c', 'b', 'a'], 'b');
+	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, ['a']));
 };
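
For readers who want to try the new API, here is a minimal sketch of driving the reverse tokenizer from outside the module. The input bytes are made up for illustration; only rtokenize, next_token, and equal come from bytes as patched above.

use bytes;

export fn main() void = {
	// Same data a forward tokenize() call would see, but tokens are
	// yielded right-to-left: "ef", then "cd", then "ab".
	let t = bytes::rtokenize(['a', 'b', ':', 'c', 'd', ':', 'e', 'f'], [':']);
	assert(bytes::equal(bytes::next_token(&t) as []u8, ['e', 'f']));
	assert(bytes::equal(bytes::next_token(&t) as []u8, ['c', 'd']));
	assert(bytes::equal(bytes::next_token(&t) as []u8, ['a', 'b']));
	assert(bytes::next_token(&t) is void); // input exhausted
};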
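A similar sketch contrasts rcut with cut: cut splits on the first occurrence of the delimiter, rcut on the last. Again the input is illustrative only, mirroring the style of the tests in the patch.

use bytes;

export fn main() void = {
	// cut() splits at the first 'b'; rcut() splits at the last one.
	const c = bytes::cut(['a', 'b', 'c', 'b', 'a'], 'b');
	assert(bytes::equal(c.0, ['a']) && bytes::equal(c.1, ['c', 'b', 'a']));
	const r = bytes::rcut(['a', 'b', 'c', 'b', 'a'], 'b');
	assert(bytes::equal(r.0, ['a', 'b', 'c']) && bytes::equal(r.1, ['a']));
};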