hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git

commit 2529f630402370e81e43322b3fb7df078b7e4339
parent 9f48b024b6ebb12cd95e15d4dbe9432234d54d2a
Author: Bor Grošelj Simić <bor.groseljsimic@telemach.net>
Date:   Mon, 22 Feb 2021 01:09:48 +0100

{bytes,strings}/tokenize.ha: implement peeking

Diffstat:
M bytes/tokenize.ha   | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M strings/tokenize.ha | 37 +++++++++++++++++++++++++------------
2 files changed, 78 insertions(+), 46 deletions(-)

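In short: the tokenizer's end: bool flag is replaced by a cached delimiter
position p: size, with types::SIZE_MAX as the "nothing cached" sentinel.
peek_token() finds and caches the next delimiter position without consuming
input; next_token() returns the peeked token and advances past it. A minimal
usage sketch of the new bytes API (not part of the commit; it assumes
bytes::equal and the match/cast syntax of this era, both visible in the tests
below):

	use bytes;

	export fn main() void = {
		const input: [_]u8 = [1, 2, 24, 42, 3, 4];
		let t = bytes::tokenize(input, [24, 42]);

		// Peeking does not advance the cursor, so the following
		// next_token call returns the same token.
		const peeked = bytes::peek_token(&t) as []u8; // [1, 2]
		const taken = bytes::next_token(&t) as []u8;  // also [1, 2]
		assert(bytes::equal(peeked, taken));

		// The cursor only moves on next_token.
		assert(bytes::equal(bytes::next_token(&t) as []u8, [3, 4]));
		assert(bytes::peek_token(&t) is void);
		assert(bytes::next_token(&t) is void);
	};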
diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -1,38 +1,52 @@
+use types;
+
 // The state for a tokenizer.
-export type tokenizer = struct { s: []u8, d: []u8, end: bool };
+export type tokenizer = struct { s: []u8, d: []u8, p: size };
 
 // Returns a tokenizer which yields sub-slices tokenized by a delimiter.
-export fn tokenize(s: []u8, delim: []u8) tokenizer = tokenizer {
-	s = s,
-	d = delim,
-	end = false,
+// Caller should ensure delim is not an empty slice
+export fn tokenize(s: []u8, delim: []u8) tokenizer = {
+	assert(len(delim) > 0);
+	return tokenizer {
+		s = s,
+		d = delim,
+		p = types::SIZE_MAX,
+	};
 };
 
 // Returns the next slice from a tokenizer, and advances the cursor. Returns
 // void if there are no tokens left and on all subsequent invocations. If a
 // string starts with, or ends with, a token, an empty slice is returned at the
 // beginning or end of the sequence, respectively.
-export fn next_token(s: *tokenizer) ([]u8 | void) = {
-	if (s.end) {
+export fn next_token(s: *tokenizer) ([]u8 | void) = match (peek_token(s)) {
+	b: []u8 => {
+		if (s.p == len(s.s)) {
+			s.d = s.d[..0];
+			s.s = s.s[..0];
+		} else {
+			s.s = s.s[s.p + len(s.d)..];
+		};
+		s.p = types::SIZE_MAX;
+		return b;
+	},
+	void => void,
+};
+
+// Same as next_token(), but does not advance the cursor
+export fn peek_token(s: *tokenizer) ([]u8 | void) = {
+	if (len(s.d) == 0) {
 		return;
 	};
-
-	match (index(s.s, s.d)) {
-		i: size => {
-			let tok = s.s[..i];
-			s.s = s.s[i+len(s.d)..];
-			return tok;
-		},
-		void => {
-			s.end = true;
-			let tok = s.s[..];
-			s.s = s.s[..0];
-			return tok;
-		},
+	if (s.p > len(s.s)) {
+		s.p = match (index(s.s, s.d)) {
+			i: size => i,
+			void => len(s.s),
+		};
 	};
-
+	return s.s[..s.p];
 };
+
 // Returns the remainder of the slice associated with a tokenizer, without doing
 // any further tokenization.
 export fn remaining_tokens(s: *tokenizer) []u8 = {
@@ -43,41 +57,48 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
 	let t = tokenize(input, [24, 42]);
 
-	match (next_token(&t)) {
-		b: []u8 => assert(equal([1, 2], b)),
-		void => abort(),
-	};
+	let p = peek_token(&t) as []u8;
+	let n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([1, 2], n));
 
-	match (next_token(&t)) {
-		b: []u8 => assert(equal([3, 24], b)),
-		void => abort(),
-	};
+	p = peek_token(&t) as []u8;
+	n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([3, 24], n));
 
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([4, 5], b)),
 		void => abort(),
 	};
 
+	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
 
 	const input2: [_]u8 = [24, 42, 1, 24, 42];
 	t = tokenize(input2, [24, 42]);
 
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([], b)),
 		void => abort(),
 	};
 
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([1], b)),
 		void => abort(),
 	};
 
+	//assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
+	//assert(false);
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([], b)),
 		void => abort(),
 	};
 
+	assert(peek_token(&t) is void);
	assert(next_token(&t) is void);
 
 	const input3: [_]u8 = [1, 1, 1, 2, 1, 1, 2, 2];
@@ -113,11 +134,7 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 		void => abort(),
 	};
 
-	match (next_token(&t)) {
-		b: []u8 => abort(),
-		void => void,
-	};
-
+	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
 
 	const input5: [_]u8 = [24, 42, 1, 24, 42, 2, 3, 4];
@@ -134,4 +151,6 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 	};
 
 	assert(equal(remaining_tokens(&t), [2, 3, 4]));
+	assert(equal(peek_token(&t) as []u8, [2, 3, 4]));
+	assert(equal(remaining_tokens(&t), [2, 3, 4]));
 };
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -23,6 +23,14 @@ export fn next_token(s: *tokenizer) (str | void) = {
 	};
 };
 
+// Same as next_token(), but does not advance the cursor
+export fn peek_token(s: *tokenizer) (str | void) = {
+	return match (bytes::peek_token(s)) {
+		b: []u8 => from_utf8(b),
+		void => void,
+	};
+};
+
 // Returns the remainder of the string associated with a tokenizer, without doing
 // any further tokenization.
 export fn remaining_tokens(s: *tokenizer) str = {
@@ -41,26 +49,31 @@ export fn remaining_tokens(s: *tokenizer) str = {
 		void => abort(),
 	};
 
-	match (next_token(&tok)) {
+	match (peek_token(&tok)) {
 		s: str => assert(s == "name"),
 		void => abort(),
 	};
 
-	assert(remaining_tokens(&tok) == "is drew");
-	tok = tokenize("foo", "foo");
-	match (next_token(&tok)) {
-		s: str => assert(s == ""),
-		void => abort(),
-	};
 
 	match (next_token(&tok)) {
-		s: str => assert(s == ""),
+		s: str => assert(s == "name"),
 		void => abort(),
 	};
-	match (next_token(&tok)) {
-		s: str => abort(),
-		void => void,
-	};
+
+	assert(remaining_tokens(&tok) == "is drew");
+	assert(peek_token(&tok) as str == "is");
+	assert(remaining_tokens(&tok) == "is drew");
+
+	tok = tokenize("foo", "foo");
+
+	assert(peek_token(&tok) as str == "");
+	assert(next_token(&tok) as str == "");
+
+	assert(peek_token(&tok) as str == "");
+	assert(next_token(&tok) as str == "");
+
+	assert(peek_token(&tok) is void);
+	assert(next_token(&tok) is void);
 };
 
 // Splits a string into tokens delimited by 'delim', returning a slice of up to
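The string-level wrapper delegates to bytes::peek_token and decodes the result
with from_utf8, so peeking is idempotent at both layers and an outstanding peek
does not affect remaining_tokens. A usage sketch mirroring the test above (not
part of the commit; same era syntax assumed):

	use strings;

	export fn main() void = {
		let tok = strings::tokenize("my name is drew", " ");
		assert(strings::next_token(&tok) as str == "my");

		// Repeated peeks return the same token; only next_token
		// advances the cursor.
		assert(strings::peek_token(&tok) as str == "name");
		assert(strings::peek_token(&tok) as str == "name");
		assert(strings::next_token(&tok) as str == "name");

		// remaining_tokens is unaffected by a cached peek.
		assert(strings::remaining_tokens(&tok) == "is drew");
		assert(strings::peek_token(&tok) as str == "is");
		assert(strings::remaining_tokens(&tok) == "is drew");
	};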