commit 2529f630402370e81e43322b3fb7df078b7e4339
parent 9f48b024b6ebb12cd95e15d4dbe9432234d54d2a
Author: Bor Grošelj Simić <bor.groseljsimic@telemach.net>
Date: Mon, 22 Feb 2021 01:09:48 +0100
{bytes,strings}/tokenize.ha: implement peeking
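
A minimal usage sketch of the new API (illustrative only; the asserts
mirror the tests added below): peek_token() yields the same token that
the next call to next_token() will yield, but leaves the cursor where
it is.

	use strings;

	export fn main() void = {
		let t = strings::tokenize("my name is drew", " ");

		// Peeking is idempotent: the cursor does not move.
		assert(strings::peek_token(&t) as str == "my");
		assert(strings::peek_token(&t) as str == "my");

		// next_token() consumes the token that was just peeked.
		assert(strings::next_token(&t) as str == "my");
		assert(strings::peek_token(&t) as str == "name");

		// Peeking leaves the remainder untouched.
		assert(strings::remaining_tokens(&t) == "name is drew");
	};
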
Diffstat:
2 files changed, 78 insertions(+), 46 deletions(-)
diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -1,38 +1,52 @@
+use types;
+
 // The state for a tokenizer.
-export type tokenizer = struct { s: []u8, d: []u8, end: bool };
+export type tokenizer = struct { s: []u8, d: []u8, p: size };
 
 // Returns a tokenizer which yields sub-slices tokenized by a delimiter.
-export fn tokenize(s: []u8, delim: []u8) tokenizer = tokenizer {
-	s = s,
-	d = delim,
-	end = false,
+// The caller should ensure that delim is not an empty slice.
+export fn tokenize(s: []u8, delim: []u8) tokenizer = {
+	assert(len(delim) > 0);
+	return tokenizer {
+		s = s,
+		d = delim,
+		p = types::SIZE_MAX,
+	};
 };
 // Returns the next slice from a tokenizer, and advances the cursor. Returns
 // void if there are no tokens left and on all subsequent invocations. If a
 // string starts with, or ends with, a delimiter, an empty slice is returned
 // at the beginning or end of the sequence, respectively.
-export fn next_token(s: *tokenizer) ([]u8 | void) = {
-	if (s.end) {
+export fn next_token(s: *tokenizer) ([]u8 | void) = match (peek_token(s)) {
+	b: []u8 => {
+		if (s.p == len(s.s)) {
+			s.d = s.d[..0];
+			s.s = s.s[..0];
+		} else {
+			s.s = s.s[s.p + len(s.d)..];
+		};
+		s.p = types::SIZE_MAX;
+		return b;
+	},
+	void => void,
+};
+
+// Same as next_token(), but does not advance the cursor.
+export fn peek_token(s: *tokenizer) ([]u8 | void) = {
+	if (len(s.d) == 0) {
 		return;
 	};
-
-	match (index(s.s, s.d)) {
-		i: size => {
-			let tok = s.s[..i];
-			s.s = s.s[i+len(s.d)..];
-			return tok;
-		},
-		void => {
-			s.end = true;
-			let tok = s.s[..];
-			s.s = s.s[..0];
-			return tok;
-		},
+	if (s.p > len(s.s)) {
+		s.p = match (index(s.s, s.d)) {
+			i: size => i,
+			void => len(s.s),
+		};
 	};
-
+	return s.s[..s.p];
 };
+
 // Returns the remainder of the slice associated with a tokenizer, without doing
 // any further tokenization.
 export fn remaining_tokens(s: *tokenizer) []u8 = {
@@ -43,41 +57,48 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
 	let t = tokenize(input, [24, 42]);
-	match (next_token(&t)) {
-		b: []u8 => assert(equal([1, 2], b)),
-		void => abort(),
-	};
+	let p = peek_token(&t) as []u8;
+	let n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([1, 2], n));
-	match (next_token(&t)) {
-		b: []u8 => assert(equal([3, 24], b)),
-		void => abort(),
-	};
+	p = peek_token(&t) as []u8;
+	n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([3, 24], n));
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([4, 5], b)),
 		void => abort(),
 	};
+	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
 	const input2: [_]u8 = [24, 42, 1, 24, 42];
 	t = tokenize(input2, [24, 42]);
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([], b)),
 		void => abort(),
 	};
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([1], b)),
 		void => abort(),
 	};
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([], b)),
 		void => abort(),
 	};
+	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
 	const input3: [_]u8 = [1, 1, 1, 2, 1, 1, 2, 2];
@@ -113,11 +134,7 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 		void => abort(),
 	};
-	match (next_token(&t)) {
-		b: []u8 => abort(),
-		void => void,
-	};
-
+	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
 	const input5: [_]u8 = [24, 42, 1, 24, 42, 2, 3, 4];
@@ -134,4 +151,6 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 	};
 	assert(equal(remaining_tokens(&t), [2, 3, 4]));
+	assert(equal(peek_token(&t) as []u8, [2, 3, 4]));
+	assert(equal(remaining_tokens(&t), [2, 3, 4]));
 };
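
An aside on the design, as a hand-traced sketch (assuming only the
semantics shown above, nothing beyond this patch): the new p field
caches the result of the delimiter search performed by peek_token(),
so a peek_token() followed by next_token() runs index() only once.
types::SIZE_MAX serves as the "nothing cached yet" sentinel, and
p == len(s.s) marks the final, delimiter-less token.

	use bytes;

	export fn main() void = {
		// [1, 0, 2] with delimiter [0] yields two tokens: [1] and [2].
		let t = bytes::tokenize([1, 0, 2], [0]);

		// peek_token() runs index() once and caches p = 1.
		assert(bytes::equal(bytes::peek_token(&t) as []u8, [1]));

		// next_token() reuses the cached p, steps past the delimiter,
		// and resets p to the types::SIZE_MAX sentinel.
		assert(bytes::equal(bytes::next_token(&t) as []u8, [1]));

		// No delimiter remains, so p = len(s.s): the final token.
		assert(bytes::equal(bytes::next_token(&t) as []u8, [2]));

		// Afterwards both functions yield void on every call.
		assert(bytes::peek_token(&t) is void);
		assert(bytes::next_token(&t) is void);
	};
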
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -23,6 +23,14 @@ export fn next_token(s: *tokenizer) (str | void) = {
 	};
 };
 
+// Same as next_token(), but does not advance the cursor.
+export fn peek_token(s: *tokenizer) (str | void) = {
+	return match (bytes::peek_token(s)) {
+		b: []u8 => from_utf8(b),
+		void => void,
+	};
+};
+
 // Returns the remainder of the string associated with a tokenizer, without doing
 // any further tokenization.
 export fn remaining_tokens(s: *tokenizer) str = {
@@ -41,26 +49,31 @@ export fn remaining_tokens(s: *tokenizer) str = {
 		void => abort(),
 	};
-	match (next_token(&tok)) {
+	match (peek_token(&tok)) {
 		s: str => assert(s == "name"),
 		void => abort(),
 	};
-	assert(remaining_tokens(&tok) == "is drew");
-	tok = tokenize("foo", "foo");
-	match (next_token(&tok)) {
-		s: str => assert(s == ""),
-		void => abort(),
-	};
 	match (next_token(&tok)) {
-		s: str => assert(s == ""),
+		s: str => assert(s == "name"),
 		void => abort(),
 	};
-	match (next_token(&tok)) {
-		s: str => abort(),
-		void => void,
-	};
+
+	assert(remaining_tokens(&tok) == "is drew");
+	assert(peek_token(&tok) as str == "is");
+	assert(remaining_tokens(&tok) == "is drew");
+
+	tok = tokenize("foo", "foo");
+
+	assert(peek_token(&tok) as str == "");
+	assert(next_token(&tok) as str == "");
+
+	assert(peek_token(&tok) as str == "");
+	assert(next_token(&tok) as str == "");
+
+	assert(peek_token(&tok) is void);
+	assert(next_token(&tok) is void);
 };
 
 // Splits a string into tokens delimited by 'delim', returning a slice of up to