hare

The Hare programming language
git clone https://git.torresjrjr.com/hare.git

commit 2529f630402370e81e43322b3fb7df078b7e4339
parent 9f48b024b6ebb12cd95e15d4dbe9432234d54d2a
Author: Bor Grošelj Simić <bor.groseljsimic@telemach.net>
Date:   Mon, 22 Feb 2021 01:09:48 +0100

{bytes,strings}/tokenize.ha: implement peeking

Diffstat:
M bytes/tokenize.ha   | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------
M strings/tokenize.ha | 37 +++++++++++++++++++++++++------------
2 files changed, 78 insertions(+), 46 deletions(-)

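In short: the tokenizer's end: bool flag is replaced by a cached delimiter
position p: size, with types::SIZE_MAX as the "nothing cached" sentinel.
peek_token() finds and caches the next delimiter position without consuming
input; next_token() returns the peeked token and advances past it. A minimal
usage sketch of the new bytes API (not part of the commit; it assumes
bytes::equal and the match/cast syntax of this era, both visible in the tests
below):

	use bytes;

	export fn main() void = {
		const input: [_]u8 = [1, 2, 24, 42, 3, 4];
		let t = bytes::tokenize(input, [24, 42]);

		// Peeking does not advance the cursor, so the following
		// next_token call returns the same token.
		const peeked = bytes::peek_token(&t) as []u8; // [1, 2]
		const taken = bytes::next_token(&t) as []u8;  // also [1, 2]
		assert(bytes::equal(peeked, taken));

		// The cursor only moves on next_token.
		assert(bytes::equal(bytes::next_token(&t) as []u8, [3, 4]));
		assert(bytes::peek_token(&t) is void);
		assert(bytes::next_token(&t) is void);
	};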
diff --git a/bytes/tokenize.ha b/bytes/tokenize.ha
@@ -1,38 +1,52 @@
+use types;
+
 // The state for a tokenizer.
-export type tokenizer = struct { s: []u8, d: []u8, end: bool };
+export type tokenizer = struct { s: []u8, d: []u8, p: size };
 
 // Returns a tokenizer which yields sub-slices tokenized by a delimiter.
-export fn tokenize(s: []u8, delim: []u8) tokenizer = tokenizer {
-	s = s,
-	d = delim,
-	end = false,
+// Caller should ensure delim is not an empty slice
+export fn tokenize(s: []u8, delim: []u8) tokenizer = {
+	assert(len(delim) > 0);
+	return tokenizer {
+		s = s,
+		d = delim,
+		p = types::SIZE_MAX,
+	};
 };
 
 // Returns the next slice from a tokenizer, and advances the cursor. Returns
 // void if there are no tokens left and on all subsequent invocations. If a
 // string starts with, or ends with, a token, an empty slice is returned at the
 // beginning or end of the sequence, respectively.
-export fn next_token(s: *tokenizer) ([]u8 | void) = {
-	if (s.end) {
+export fn next_token(s: *tokenizer) ([]u8 | void) = match (peek_token(s)) {
+	b: []u8 => {
+		if (s.p == len(s.s)) {
+			s.d = s.d[..0];
+			s.s = s.s[..0];
+		} else {
+			s.s = s.s[s.p + len(s.d)..];
+		};
+		s.p = types::SIZE_MAX;
+		return b;
+	},
+	void => void,
+};
+
+// Same as next_token(), but does not advance the cursor
+export fn peek_token(s: *tokenizer) ([]u8 | void) = {
+	if (len(s.d) == 0) {
 		return;
 	};
-
-	match (index(s.s, s.d)) {
-		i: size => {
-			let tok = s.s[..i];
-			s.s = s.s[i+len(s.d)..];
-			return tok;
-		},
-		void => {
-			s.end = true;
-			let tok = s.s[..];
-			s.s = s.s[..0];
-			return tok;
-		},
+	if (s.p > len(s.s)) {
+		s.p = match (index(s.s, s.d)) {
+			i: size => i,
+			void => len(s.s),
+		};
 	};
-
+	return s.s[..s.p];
 };
+
 // Returns the remainder of the slice associated with a tokenizer, without doing
 // any further tokenization.
 export fn remaining_tokens(s: *tokenizer) []u8 = {
@@ -43,41 +57,48 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
 	let t = tokenize(input, [24, 42]);
 
-	match (next_token(&t)) {
-		b: []u8 => assert(equal([1, 2], b)),
-		void => abort(),
-	};
+	let p = peek_token(&t) as []u8;
+	let n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([1, 2], n));
 
-	match (next_token(&t)) {
-		b: []u8 => assert(equal([3, 24], b)),
-		void => abort(),
-	};
+	p = peek_token(&t) as []u8;
+	n = next_token(&t) as []u8;
+	assert(equal(p, n));
+	assert(equal([3, 24], n));
 
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([4, 5], b)),
 		void => abort(),
 	};
 
+	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
 
 	const input2: [_]u8 = [24, 42, 1, 24, 42];
 	t = tokenize(input2, [24, 42]);
 
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([], b)),
 		void => abort(),
 	};
 
+	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([1], b)),
 		void => abort(),
 	};
 
+	//assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
+	//assert(false);
 	match (next_token(&t)) {
 		b: []u8 => assert(equal([], b)),
 		void => abort(),
 	};
 
+	assert(peek_token(&t) is void);
	assert(next_token(&t) is void);
 
 	const input3: [_]u8 = [1, 1, 1, 2, 1, 1, 2, 2];
@@ -113,11 +134,7 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 		void => abort(),
 	};
 
-	match (next_token(&t)) {
-		b: []u8 => abort(),
-		void => void,
-	};
-
+	assert(peek_token(&t) is void);
 	assert(next_token(&t) is void);
 
 	const input5: [_]u8 = [24, 42, 1, 24, 42, 2, 3, 4];
@@ -134,4 +151,6 @@ export fn remaining_tokens(s: *tokenizer) []u8 = {
 	};
 
 	assert(equal(remaining_tokens(&t), [2, 3, 4]));
+	assert(equal(peek_token(&t) as []u8, [2, 3, 4]));
+	assert(equal(remaining_tokens(&t), [2, 3, 4]));
 };
diff --git a/strings/tokenize.ha b/strings/tokenize.ha
@@ -23,6 +23,14 @@ export fn next_token(s: *tokenizer) (str | void) = {
 	};
 };
 
+// Same as next_token(), but does not advance the cursor
+export fn peek_token(s: *tokenizer) (str | void) = {
+	return match (bytes::peek_token(s)) {
+		b: []u8 => from_utf8(b),
+		void => void,
+	};
+};
+
 // Returns the remainder of the string associated with a tokenizer, without doing
 // any further tokenization.
 export fn remaining_tokens(s: *tokenizer) str = {
@@ -41,26 +49,31 @@ export fn remaining_tokens(s: *tokenizer) str = {
 		void => abort(),
 	};
 
-	match (next_token(&tok)) {
+	match (peek_token(&tok)) {
 		s: str => assert(s == "name"),
 		void => abort(),
 	};
 
-	assert(remaining_tokens(&tok) == "is drew");
-	tok = tokenize("foo", "foo");
-	match (next_token(&tok)) {
-		s: str => assert(s == ""),
-		void => abort(),
-	};
 
 	match (next_token(&tok)) {
-		s: str => assert(s == ""),
+		s: str => assert(s == "name"),
 		void => abort(),
 	};
-	match (next_token(&tok)) {
-		s: str => abort(),
-		void => void,
-	};
+
+	assert(remaining_tokens(&tok) == "is drew");
+	assert(peek_token(&tok) as str == "is");
+	assert(remaining_tokens(&tok) == "is drew");
+
+	tok = tokenize("foo", "foo");
+
+	assert(peek_token(&tok) as str == "");
+	assert(next_token(&tok) as str == "");
+
+	assert(peek_token(&tok) as str == "");
+	assert(next_token(&tok) as str == "");
+
+	assert(peek_token(&tok) is void);
+	assert(next_token(&tok) is void);
 };
 
 // Splits a string into tokens delimited by 'delim', returning a slice of up to
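The string-level wrapper delegates to bytes::peek_token and decodes the result
with from_utf8, so peeking is idempotent at both layers and an outstanding peek
does not affect remaining_tokens. A usage sketch mirroring the test above (not
part of the commit; same era syntax assumed):

	use strings;

	export fn main() void = {
		let tok = strings::tokenize("my name is drew", " ");
		assert(strings::next_token(&tok) as str == "my");

		// Repeated peeks return the same token; only next_token
		// advances the cursor.
		assert(strings::peek_token(&tok) as str == "name");
		assert(strings::peek_token(&tok) as str == "name");
		assert(strings::next_token(&tok) as str == "name");

		// remaining_tokens is unaffected by a cached peek.
		assert(strings::remaining_tokens(&tok) == "is drew");
		assert(strings::peek_token(&tok) as str == "is");
		assert(strings::remaining_tokens(&tok) == "is drew");
	};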