commit 36c9c7c1dbd41dcdcf9448ea45d39339856acf5b
parent 52b3f2d0c7a85e04a79666a954101e527b7f1272
Author: Autumn! <autumnull@posteo.net>
Date: Sun, 12 Feb 2023 21:09:46 +0000
bytes: Add rtokenize and rcut
Signed-off-by: Autumn! <autumnull@posteo.net>
Diffstat:
M bytes/tokenize.ha | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 109 insertions(+), 18 deletions(-)
diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -5,13 +5,14 @@
use types;
export type tokenizer = struct {
- s: []u8,
- d: []u8,
- p: size,
+ s: []u8, // string being tokenized
+ d: []u8, // delimiter
+ p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones.
};
-// Returns a tokenizer which yields sub-slices tokenized by a delimiter. The
-// caller must ensure that 'delimiter' is not an empty slice.
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting
+// at the beginning of the slice. The caller must ensure that 'delimiter' is not
+// an empty slice. Can tokenize a slice of length less than [[types::I64_MAX]].
export fn tokenize(s: []u8, delim: []u8) tokenizer = {
assert(len(delim) > 0, "bytes::tokenize called with empty slice");
if (len(s) == 0) {
@@ -20,7 +21,25 @@ export fn tokenize(s: []u8, delim: []u8) tokenizer = {
return tokenizer {
s = s,
d = delim,
- p = types::SIZE_MAX,
+ p = types::I64_MAX, // I64_MAX means we haven't peeked the next token yet.
+ };
+};
+
+// Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting at
+// the end of the slice and moving backwards with each call to [[next_token]]. The
+// caller must ensure that 'delimiter' is not an empty slice. Can tokenize a slice
+// of length less than [[types::I64_MAX]].
+export fn rtokenize(s: []u8, delim: []u8) tokenizer = {
+ assert(len(delim) > 0, "bytes::rtokenize called with empty slice");
+ if (len(s) == 0) {
+ delim = [];
+ };
+ return tokenizer {
+ s = s,
+ d = delim,
+ p = types::I64_MIN, // I64_MIN means we haven't peeked the next token yet.
+ // also note that p == -1 corresponds to an index of len(s),
+ // and p == -(len(s) + 1) corresponds to an index of 0.
};
};
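
For illustration, a minimal sketch of how the two constructors pair up, assuming
the [[next_token]] semantics documented below (the 'words' input is a made-up
example):

	const words: [_]u8 = [1, 0, 2, 0, 3];
	let fwd = tokenize(words, [0]);
	assert(equal(next_token(&fwd) as []u8, [1])); // yields front-to-back
	let rev = rtokenize(words, [0]);
	assert(equal(next_token(&rev) as []u8, [3])); // yields back-to-front
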
@@ -30,13 +49,24 @@ export fn tokenize(s: []u8, delim: []u8) tokenizer = {
// beginning or end of the sequence, respectively.
export fn next_token(s: *tokenizer) ([]u8 | void) = match (peek_token(s)) {
case let b: []u8 =>
- if (s.p == len(s.s)) {
- s.d = s.d[..0];
- s.s = s.s[..0];
+ if (s.p < 0) { // reverse
+ if (len(s.s): i64 + s.p + 1 == 0) {
+ s.d = s.d[..0];
+ s.s = s.s[..0];
+ } else {
+ const end = (len(s.s): i64 + s.p + 1): size - len(s.d);
+ s.s = s.s[..end];
+ };
+ s.p = types::I64_MIN;
} else {
- s.s = s.s[s.p + len(s.d)..];
+ if (s.p == len(s.s): i64) {
+ s.d = s.d[..0];
+ s.s = s.s[..0];
+ } else {
+ s.s = s.s[s.p: size + len(s.d)..];
+ };
+ s.p = types::I64_MAX;
};
- s.p = types::SIZE_MAX;
return b;
case => void;
};
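
To make the signed-index bookkeeping above concrete, a worked example (the
values are made up for illustration):

	// With s.s == [7, 8, 9, 4, 5] (len 5) and s.d == [9] (len 1):
	//   rindex finds the delimiter at i == 2, so peek_token stores
	//     p = (2 + 1) - 5 - 1 = -3
	//   token start = 5 + (-3) + 1 = 3, so peek returns s.s[3..] == [4, 5]
	//   next_token then computes end = 3 - 1 = 2 and trims to s.s[..2] == [7, 8]
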
@@ -46,15 +76,27 @@ export fn peek_token(s: *tokenizer) ([]u8 | void) = {
if (len(s.d) == 0) {
return;
};
- if (s.p > len(s.s)) {
- s.p = match (index(s.s, s.d)) {
- case let i: size =>
- yield i;
- case void =>
- yield len(s.s);
+ if (s.p < 0) { // reverse
+ if (s.p == types::I64_MIN) {
+ s.p = match (rindex(s.s, s.d)) {
+ case let i: size =>
+ yield (i + len(s.d)): i64 - len(s.s): i64 - 1;
+ case void =>
+ yield -(len(s.s): i64 + 1);
+ };
};
+ return s.s[len(s.s) + s.p: size + 1..];
+ } else {
+ if (s.p == types::I64_MAX) {
+ s.p = match (index(s.s, s.d)) {
+ case let i: size =>
+ yield i: i64;
+ case void =>
+ yield len(s.s): i64;
+ };
+ };
+ return s.s[..s.p: size];
};
- return s.s[..s.p];
};
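
Note that when the delimiter does not occur in the remaining slice, the reverse
branch stores p = -(len(s.s) + 1), which decodes to a token start of 0: for
example, with s.s == [1, 2] that is p == -3, and s.s[2 + (-3) + 1..] is the
whole slice, so the final peek yields everything that is left.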
@@ -116,6 +158,29 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
t = tokenize([]: []u8, [42]);
assert(peek_token(&t) is void);
assert(next_token(&t) is void);
+
+ const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
+ let t = rtokenize(input, [24, 42]);
+ let p = peek_token(&t) as []u8;
+ let n = next_token(&t) as []u8;
+ assert(equal(p, n));
+ assert(equal([4, 5], n));
+ let p = peek_token(&t) as []u8;
+ let n = next_token(&t) as []u8;
+ assert(equal(p, n));
+ assert(equal([3, 24], n));
+ assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
+ assert(equal([1, 2], next_token(&t) as []u8));
+ assert(peek_token(&t) is void);
+ assert(next_token(&t) is void);
+
+ const input: [_]u8 = [1, 2, 3, 24, 42, 4, 24, 42];
+ t = rtokenize(input, [24, 42]);
+ assert(equal([], next_token(&t) as []u8));
+ assert(equal([4], next_token(&t) as []u8));
+ assert(equal(remaining_tokens(&t), [1, 2, 3]));
+ assert(equal(peek_token(&t) as []u8, [1, 2, 3]));
+ assert(equal(remaining_tokens(&t), [1, 2, 3]));
};
// Returns the input slice "cut" along the first instance of a delimiter,
@@ -139,6 +204,27 @@ export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
};
};
+// Returns the input slice "cut" along the last instance of a delimiter,
+// returning everything up to the delimiter, and everything after the delimiter,
+// in a tuple. The contents are borrowed from the input slice.
+//
+// The caller must ensure that 'delimiter' is not an empty slice.
+export fn rcut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
+ let ln = if (delim is u8) {
+ yield 1z;
+ } else {
+ let ln = len(delim: []u8);
+ assert(ln > 0, "bytes::rcut called with empty delimiter");
+ yield ln;
+ };
+ match (rindex(in, delim)) {
+ case let i: size =>
+ return (in[..i], in[i + ln..]);
+ case void =>
+ return (in, []);
+ };
+};
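
For illustration, how cut and rcut differ on the same input (a sketch mirroring
the tests below):

	const c = cut(['a', 'b', 'c', 'b', 'a'], 'b');
	// c.0 == ['a'], c.1 == ['c', 'b', 'a'] -- split at the first 'b'
	const r = rcut(['a', 'b', 'c', 'b', 'a'], 'b');
	// r.0 == ['a', 'b', 'c'], r.1 == ['a'] -- split at the last 'b'
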
+
@test fn cut() void = {
const c = cut(['a', 'b', 'c'], ['b']);
assert(equal(c.0, ['a']) && equal(c.1, ['c']));
@@ -150,4 +236,9 @@ export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, []));
const c = cut([], 'x');
assert(equal(c.0, []) && equal(c.1, []));
+
+ const c = rcut(['a', 'b', 'c'], ['b']);
+ assert(equal(c.0, ['a']) && equal(c.1, ['c']));
+ const c = rcut(['a', 'b', 'c', 'b', 'a'], 'b');
+ assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, ['a']));
};