hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

tokenize.ha (7905B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use types;
      5 
      6 export type tokenizer = struct {
      7 	s: []u8, // string being tokenized
      8 	d: []u8, // delimiter
      9 	p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones.
     10 };
     11 
     12 // Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting
     13 // at the beginning of the slice. The caller must ensure that 'delim' is not an
     14 // empty slice. Can tokenize a slice of length less than [[types::I64_MAX]].
     15 export fn tokenize(s: []u8, delim: []u8) tokenizer = {
     16 	assert(len(delim) > 0, "bytes::tokenize called with empty slice");
     17 	if (len(s) == 0) {
     18 		delim = [];
     19 	};
     20 	return tokenizer {
     21 		s = s,
     22 		d = delim,
     23 		p = types::I64_MAX, // I64_MAX means we haven't peeked the next token yet.
     24 	};
     25 };
     26 
     27 // Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting at
     28 // the end of the slice and moving backwards with each call to [[next_token]]. The
     29 // caller must ensure that 'delimiter' is not an empty slice. Can tokenize a slice
     30 // of length less than [[types::I64_MAX]].
     31 export fn rtokenize(s: []u8, delim: []u8) tokenizer = {
     32 	assert(len(delim) > 0, "bytes::rtokenize called with empty slice");
     33 	if (len(s) == 0) {
     34 		delim = [];
     35 	};
     36 	return tokenizer {
     37 		s = s,
     38 		d = delim,
     39 		p = types::I64_MIN, // I64_MIN means we haven't peeked the next token yet.
     40 		// also note that p == -1 corresponds to an index of len(s),
     41 		// and p == -(1 - len(s)) corresponds to an index of 0.
     42 	};
     43 };
     44 
     45 // Returns the next slice from a tokenizer, and advances the cursor. Returns
     46 // void if there are no tokens left and on all subsequent invocations. If a
     47 // string starts with, or ends with, a token, an empty slice is returned at the
     48 // beginning or end of the sequence, respectively.
     49 export fn next_token(s: *tokenizer) ([]u8 | void) = {
     50 	const b = match (peek_token(s)) {
     51 	case let b: []u8 =>
     52 		yield b;
     53 	case => return;
     54 	};
     55 
     56 	if (s.p < 0) { // reverse
     57 		if (len(s.s): i64 + s.p + 1 == 0) {
     58 			s.d = s.d[..0];
     59 			s.s = s.s[..0];
     60 		} else {
     61 			const end = (len(s.s): i64 + s.p + 1): size - len(s.d);
     62 			s.s = s.s[..end];
     63 		};
     64 		s.p = types::I64_MIN;
     65 	} else {
     66 		if (s.p == len(s.s): i64) {
     67 			s.d = s.d[..0];
     68 			s.s = s.s[..0];
     69 		} else {
     70 			s.s = s.s[s.p: size + len(s.d)..];
     71 		};
     72 		s.p = types::I64_MAX;
     73 	};
     74 
     75 	return b;
     76 };
     77 
     78 // Same as [[next_token]], but does not advance the cursor
     79 export fn peek_token(s: *tokenizer) ([]u8 | void) = {
     80 	if (len(s.d) == 0) {
     81 		return;
     82 	};
     83 
     84 	const reverse = s.p < 0;
     85 	const ifunc = if (reverse) &rindex else &index;
     86 
     87 	const known = ((s.p < 0 && s.p != types::I64_MIN) ||
     88 		(s.p >= 0 && s.p != types::I64_MAX));
     89 	if (!known) {
     90 		let i = 0i64;
     91 		let dlen = 0i64;
     92 		let slen = len(s.s): i64;
     93 
     94 		match (ifunc(s.s, s.d)) {
     95 		case let ix: size =>
     96 			dlen = len(s.d): i64;
     97 			i = ix: i64;
     98 		case void =>
     99 			i = slen;
    100 		};
    101 
    102 		if (reverse) {
    103 			if (i == slen) {
    104 				s.p = -(slen + 1);
    105 			} else {
    106 				s.p = i + dlen - slen - 1;
    107 			};
    108 		} else {
    109 			s.p = i;
    110 		};
    111 	};
    112 
    113 	if (reverse) {
    114 		return s.s[len(s.s) + s.p: size + 1..];
    115 	} else {
    116 		return s.s[..s.p: size];
    117 	};
    118 };
    119 
    120 
    121 // Returns the remainder of the slice associated with a tokenizer, without doing
    122 // any further tokenization.
    123 export fn remaining_tokens(s: *tokenizer) []u8 = {
    124 	return s.s;
    125 };
    126 
    127 @test fn tokenize() void = {
    128 	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
    129 	let t = tokenize(input, [24, 42]);
    130 	let p = peek_token(&t) as []u8;
    131 	let n = next_token(&t) as []u8;
    132 	assert(equal(p, n));
    133 	assert(equal([1, 2], n));
    134 	let p = peek_token(&t) as []u8;
    135 	let n = next_token(&t) as []u8;
    136 	assert(equal(p, n));
    137 	assert(equal([3, 24], n));
    138 	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
    139 	assert(equal([4, 5], next_token(&t) as []u8));
    140 	assert(peek_token(&t) is void);
    141 	assert(next_token(&t) is void);
    142 
    143 	const input: [_]u8 = [24, 42, 1, 24, 42];
    144 	t = tokenize(input, [24, 42]);
    145 	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
    146 	assert(equal([], next_token(&t) as []u8));
    147 	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
    148 	assert(equal([1], next_token(&t) as []u8));
    149 	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
    150 	assert(equal([], next_token(&t) as []u8));
    151 	assert(peek_token(&t) is void);
    152 	assert(next_token(&t) is void);
    153 
    154 	const input: [_]u8 = [1, 1, 1, 2, 1, 1, 2, 2];
    155 	t = tokenize(input, [1, 2]);
    156 	assert(equal([1, 1], next_token(&t) as []u8));
    157 	assert(equal([1], next_token(&t) as []u8));
    158 	assert(equal([2], next_token(&t) as []u8));
    159 	assert(next_token(&t) is void);
    160 
    161 	const input: [_]u8 = [1, 2];
    162 	t = tokenize(input, [1, 2]);
    163 	assert(equal([], next_token(&t) as []u8));
    164 	assert(equal([], next_token(&t) as []u8));
    165 	assert(peek_token(&t) is void);
    166 	assert(next_token(&t) is void);
    167 
    168 	const input: [_]u8 = [24, 42, 1, 24, 42, 2, 3, 4];
    169 	t = tokenize(input, [24, 42]);
    170 	assert(equal([], next_token(&t) as []u8));
    171 	assert(equal([1], next_token(&t) as []u8));
    172 	assert(equal(remaining_tokens(&t), [2, 3, 4]));
    173 	assert(equal(peek_token(&t) as []u8, [2, 3, 4]));
    174 	assert(equal(remaining_tokens(&t), [2, 3, 4]));
    175 
    176 	t = tokenize([], [42]);
    177 	assert(peek_token(&t) is void);
    178 	assert(next_token(&t) is void);
    179 
    180 	const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5];
    181 	let t = rtokenize(input, [24, 42]);
    182 	let p = peek_token(&t) as []u8;
    183 	let n = next_token(&t) as []u8;
    184 	assert(equal(p, n));
    185 	assert(equal([4, 5], n));
    186 	let p = peek_token(&t) as []u8;
    187 	let n = next_token(&t) as []u8;
    188 	assert(equal(p, n));
    189 	assert(equal([3, 24], n));
    190 	assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8));
    191 	assert(equal([1, 2], next_token(&t) as []u8));
    192 	assert(peek_token(&t) is void);
    193 	assert(next_token(&t) is void);
    194 
    195 	const input: [_]u8 = [1, 2, 3, 24, 42, 4, 24, 42];
    196 	t = rtokenize(input, [24, 42]);
    197 	assert(equal([], next_token(&t) as []u8));
    198 	assert(equal([4], next_token(&t) as []u8));
    199 	assert(equal(remaining_tokens(&t), [1, 2, 3]));
    200 	assert(equal(peek_token(&t) as []u8, [1, 2, 3]));
    201 	assert(equal(remaining_tokens(&t), [1, 2, 3]));
    202 };
    203 
    204 // Returns the input slice "cut" along the first instance of a delimiter,
    205 // returning everything up to the delimiter, and everything after the delimiter,
    206 // in a tuple. The contents are borrowed from the input slice.
    207 //
    208 // The caller must ensure that 'delimiter' is not an empty slice.
    209 export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
    210 	let ln = if (delim is u8) {
    211 		yield 1z;
    212 	} else {
    213 		let ln = len(delim: []u8);
    214 		assert(ln > 0, "bytes::cut called with empty delimiter");
    215 		yield ln;
    216 	};
    217 	match (index(in, delim)) {
    218 	case let i: size =>
    219 		return (in[..i], in[i + ln..]);
    220 	case void =>
    221 		return (in, []);
    222 	};
    223 };
    224 
    225 // Returns the input slice "cut" along the last instance of a delimiter,
    226 // returning everything up to the delimiter, and everything after the delimiter,
    227 // in a tuple. The contents are borrowed from the input slice.
    228 //
    229 // The caller must ensure that 'delimiter' is not an empty slice.
    230 export fn rcut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
    231 	let ln = if (delim is u8) {
    232 		yield 1z;
    233 	} else {
    234 		let ln = len(delim: []u8);
    235 		assert(ln > 0, "bytes::rcut called with empty delimiter");
    236 		yield ln;
    237 	};
    238 	match (rindex(in, delim)) {
    239 	case let i: size =>
    240 		return (in[..i], in[i + ln..]);
    241 	case void =>
    242 		return (in, []);
    243 	};
    244 };
    245 
    246 @test fn cut() void = {
    247 	const c = cut(['a', 'b', 'c'], ['b']);
    248 	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
    249 	const c = cut(['a', 'b', 'c'], 'b');
    250 	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
    251 	const c = cut(['a', 'b', 'c', 'b', 'a'], 'b');
    252 	assert(equal(c.0, ['a']) && equal(c.1, ['c', 'b', 'a']));
    253 	const c = cut(['a', 'b', 'c'], 'x');
    254 	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, []));
    255 	const c = cut([], 'x');
    256 	assert(equal(c.0, []) && equal(c.1, []));
    257 
    258 	const c = rcut(['a', 'b', 'c'], ['b']);
    259 	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
    260 	const c = rcut(['a', 'b', 'c', 'b', 'a'], 'b');
    261 	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, ['a']));
    262 };