[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git

tokenize.ha (7547B)


// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use types;

export type tokenizer = struct {
	in: []u8,	// slice being tokenized
	delim: []u8,	// delimiter bytes
	p: i64,		// p < 0 for reverse tokenizers, 0 <= p for forward ones.
};

// Tokenizes a byte slice, returning an iterator that yields tokens from the
// slice delimited by one of any number of delimiter bytes. If the input slice
// begins or ends with a delimiter, an empty slice is returned by the first or
// last call to [[next_token]], respectively.
//
// The variadic argument slice is borrowed from the caller, who should take care
// to ensure that it is valid for the lifetime of the tokenizer.
//
// The caller must ensure that at least one delimiter is provided and that the
// length of the slice is less than [[types::I64_MAX]].
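//
// A minimal usage sketch (behavior mirrors the test cases below):
//
// 	let tok = bytes::tokenize([1, 2, 0, 3, 4], 0);
// 	assert(bytes::equal(bytes::next_token(&tok) as []u8, [1, 2]));
// 	assert(bytes::equal(bytes::next_token(&tok) as []u8, [3, 4]));
// 	assert(bytes::next_token(&tok) is done);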
export fn tokenize(in: []u8, delim: u8...) tokenizer = {
	assert(len(delim) > 0, "bytes::tokenize called with empty delimiter list");
	assert(len(in) < types::I64_MAX: size,
		"bytes::tokenize: input length exceeds I64_MAX");
	if (len(in) == 0) {
		// No input means no tokens; clearing the delimiter list makes
		// [[peek_token]] return done immediately.
		delim = [];
	};

	return tokenizer {
		in = in,
		delim = delim,
		p = types::I64_MAX, // I64_MAX means we haven't peeked the next token yet.
	};
};

// Like [[tokenize]], but tokenizes the slice in reverse, such that the first
// call to [[next_token]] returns the last token and the last call returns the
// first token.
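//
// A minimal usage sketch (behavior mirrors the test cases below):
//
// 	let tok = bytes::rtokenize([1, 2, 0, 3, 4], 0);
// 	assert(bytes::equal(bytes::next_token(&tok) as []u8, [3, 4]));
// 	assert(bytes::equal(bytes::next_token(&tok) as []u8, [1, 2]));
// 	assert(bytes::next_token(&tok) is done);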
export fn rtokenize(in: []u8, delim: u8...) tokenizer = {
	assert(len(delim) > 0, "bytes::rtokenize called with empty delimiter list");
	assert(len(in) < types::I64_MAX: size,
		"bytes::rtokenize: input length exceeds I64_MAX");
	if (len(in) == 0) {
		// No input means no tokens; clearing the delimiter list makes
		// [[peek_token]] return done immediately.
		delim = [];
	};

	return tokenizer {
		in = in,
		delim = delim,
		// I64_MIN means we haven't peeked the next token yet. Note that
		// p == -1 corresponds to an index of len(s), and
		// p == -(len(s) + 1) corresponds to an index of 0.
		p = types::I64_MIN,
	};
};

// Returns the next token from a [[tokenizer]] and advances the cursor.
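//
// A typical consumption loop (sketch):
//
// 	let tok = bytes::tokenize([1, 2, 0, 3, 4], 0);
// 	for (true) {
// 		match (bytes::next_token(&tok)) {
// 		case let t: []u8 =>
// 			assert(len(t) == 2); // [1, 2], then [3, 4]
// 		case done =>
// 			break;
// 		};
// 	};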
export fn next_token(s: *tokenizer) ([]u8 | done) = {
	const b = match (peek_token(s)) {
	case let b: []u8 =>
		yield b;
	case done => return done;
	};

	const slen = len(s.in): i64;
	const reverse = s.p < 0;
	if (reverse) {
		if (slen + s.p + 1 == 0) {
			// The final (leftmost) token was consumed; empty the
			// tokenizer so the next call returns done.
			s.delim = s.delim[..0];
			s.in = s.in[..0];
		} else {
			// Drop the consumed token and the delimiter before it
			// from the end of the input.
			const end = (slen + s.p + 1): size - 1;
			s.in = s.in[..end];
		};
		s.p = types::I64_MIN;
	} else {
		if (s.p == slen) {
			// The final (rightmost) token was consumed; empty the
			// tokenizer so the next call returns done.
			s.delim = s.delim[..0];
			s.in = s.in[..0];
		} else {
			// Drop the consumed token and its delimiter from the
			// start of the input.
			s.in = s.in[s.p: size + 1..];
		};
		s.p = types::I64_MAX;
	};

	return b;
};

// Returns the next token from a [[tokenizer]] without advancing the cursor.
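//
// A minimal sketch of the peek/advance contract:
//
// 	let tok = bytes::tokenize([1, 2, 0, 3, 4], 0);
// 	const p = bytes::peek_token(&tok) as []u8;
// 	const n = bytes::next_token(&tok) as []u8;
// 	assert(bytes::equal(p, n)); // both are [1, 2]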
export fn peek_token(s: *tokenizer) ([]u8 | done) = {
	if (len(s.delim) == 0) {
		return done;
	};

	const reverse = s.p < 0;
	const ifunc = if (reverse) &rindex else &index;

	// s.p still holding its sentinel (I64_MIN or I64_MAX) means the next
	// token has not been located yet.
	const known = ((reverse && s.p != types::I64_MIN) ||
		(!reverse && s.p != types::I64_MAX));
	if (!known) {
		// Search for the delimiter occurrence nearest the cursor: the
		// lowest index for forward tokenizers, the highest in reverse.
		let i = if (reverse) types::I64_MIN else types::I64_MAX;
		let dlen = 0i64;
		const slen = len(s.in): i64;

		for (let d .. s.delim) {
			match (ifunc(s.in, d)) {
			case let ix: size =>
				if (!reverse && ix: i64 < i) {
					i = ix: i64;
					dlen = 1;
				} else if (reverse && ix: i64 > i) {
					i = ix: i64;
					dlen = 1;
				};
			case void =>
				// This delimiter does not occur; the candidate
				// token extends to the boundary of the input.
				if (!reverse && slen < i) {
					i = slen;
				} else if (reverse && i < 0) {
					i = 0;
				};
			};
		};

		if (reverse) {
			if (i == slen) {
				s.p = -(slen + 1);
			} else {
				// Encode the token's start as a negative offset
				// from the end of the input.
				s.p = i + dlen - slen - 1;
			};
		} else {
			s.p = i;
		};
	};

	if (reverse) {
		return s.in[len(s.in) + s.p: size + 1..];
	} else {
		return s.in[..s.p: size];
	};
};

// Returns the remainder of the input slice from a [[tokenizer]] ahead of the
// token cursor.
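//
// A minimal sketch (behavior mirrors the test cases below):
//
// 	let tok = bytes::tokenize([1, 2, 0, 3, 4], 0);
// 	assert(bytes::next_token(&tok) is []u8); // consumes [1, 2]
// 	assert(bytes::equal(bytes::remaining_tokens(&tok), [3, 4]));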
export fn remaining_tokens(s: *tokenizer) []u8 = {
	return s.in;
};

fn tokenize_test(
	func: *fn([]u8, u8...) tokenizer,
	testcase: str,
	in: []u8,
	delim: []u8,
	tokens: [][]u8,
	iters: size = types::SIZE_MAX,
) tokenizer = {
	const tok = func(in, delim...);
	let n = 0z;
	for (const want .. tokens) {
		if (n >= iters) {
			return tok;
		};
		n += 1;

		// Peeking and then advancing must yield the same token.
		const p = peek_token(&tok) as []u8;
		const got = next_token(&tok) as []u8;
		assert(equal(p, got), testcase);
		assert(equal(got, want), testcase);
	};

	if (n >= iters) {
		return tok;
	};

	assert(peek_token(&tok) is done, testcase);
	assert(next_token(&tok) is done, testcase);
	return tok;
};

@test fn tokenize() void = {
	tokenize_test(&tokenize, "simple case", [1, 2, 0, 3, 4], [0], [
		[1, 2],
		[3, 4],
	]);

	tokenize_test(&tokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
		[1, 2],
		[3, 4],
		[5, 6],
	]);

	tokenize_test(&tokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
		[1, 2],
		[],
		[],
		[3, 4],
	]);

	tokenize_test(&tokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
		[],
		[1, 2, 3],
		[],
	]);

	const tok = tokenize_test(&tokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
		[1, 2],
	], 1);
	assert(equal(remaining_tokens(&tok), [3, 4]));
};

@test fn rtokenize() void = {
	tokenize_test(&rtokenize, "simple case", [1, 2, 0, 3, 4], [0], [
		[3, 4],
		[1, 2],
	]);

	tokenize_test(&rtokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
		[5, 6],
		[3, 4],
		[1, 2],
	]);

	tokenize_test(&rtokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
		[3, 4],
		[],
		[],
		[1, 2],
	]);

	tokenize_test(&rtokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
		[],
		[1, 2, 3],
		[],
	]);

	const tok = tokenize_test(&rtokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
		[3, 4],
	], 1);
	assert(equal(remaining_tokens(&tok), [1, 2]));
};

// Returns the input slice "cut" along the first instance of a delimiter,
// returning everything up to the delimiter, and everything after the delimiter,
// in a tuple. The contents are borrowed from the input slice.
//
// The caller must ensure that 'delim' is not an empty slice.
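//
// A minimal sketch (behavior mirrors the test cases below):
//
// 	const c = bytes::cut(['a', 'b', 'c', 'b', 'a'], 'b');
// 	assert(bytes::equal(c.0, ['a']) && bytes::equal(c.1, ['c', 'b', 'a']));
// 	const c = bytes::cut(['a', 'b', 'c'], 'x'); // delimiter absent
// 	assert(bytes::equal(c.0, ['a', 'b', 'c']) && bytes::equal(c.1, []));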
export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
	let ln = if (delim is u8) {
		yield 1z;
	} else {
		let ln = len(delim: []u8);
		assert(ln > 0, "bytes::cut called with empty delimiter");
		yield ln;
	};
	match (index(in, delim)) {
	case let i: size =>
		return (in[..i], in[i + ln..]);
	case void =>
		// Delimiter not found: everything before, nothing after.
		return (in, []);
	};
};

// Returns the input slice "cut" along the last instance of a delimiter,
// returning everything up to the delimiter, and everything after the delimiter,
// in a tuple. The contents are borrowed from the input slice.
//
// The caller must ensure that 'delim' is not an empty slice.
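//
// A minimal sketch (behavior mirrors the test cases below):
//
// 	const c = bytes::rcut(['a', 'b', 'c', 'b', 'a'], 'b');
// 	assert(bytes::equal(c.0, ['a', 'b', 'c']) && bytes::equal(c.1, ['a']));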
export fn rcut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
	let ln = if (delim is u8) {
		yield 1z;
	} else {
		let ln = len(delim: []u8);
		assert(ln > 0, "bytes::rcut called with empty delimiter");
		yield ln;
	};
	match (rindex(in, delim)) {
	case let i: size =>
		return (in[..i], in[i + ln..]);
	case void =>
		// Delimiter not found: everything before, nothing after.
		return (in, []);
	};
};

@test fn cut() void = {
	const c = cut(['a', 'b', 'c'], ['b']);
	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
	const c = cut(['a', 'b', 'c'], 'b');
	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
	const c = cut(['a', 'b', 'c', 'b', 'a'], 'b');
	assert(equal(c.0, ['a']) && equal(c.1, ['c', 'b', 'a']));
	const c = cut(['a', 'b', 'c'], 'x');
	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, []));
	const c = cut([], 'x');
	assert(equal(c.0, []) && equal(c.1, []));

	const c = rcut(['a', 'b', 'c'], ['b']);
	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
	const c = rcut(['a', 'b', 'c', 'b', 'a'], 'b');
	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, ['a']));
};