tokenize.ha

// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use types;

export type tokenizer = struct {
	in: []u8, // string being tokenized
	delim: []u8, // delimiter
	p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones.
};

// Tokenizes a byte slice, returning an iterator that yields tokens from the
// slice delimited by one of any number of delimiter bytes. If the input slice
// begins with or ends with a delimiter, an empty slice is returned respectively
// as the first and last call to [[next_token]].
//
// The variadic argument slice is borrowed from the caller, who should take care
// to ensure that it is valid for the lifetime of the tokenizer.
//
// The caller must ensure that at least one delimiter is provided and that the
// length of the slice is less than [[types::I64_MAX]].
export fn tokenize(in: []u8, delim: u8...) tokenizer = {
	assert(len(delim) > 0, "bytes::tokenize called with empty slice");
	assert(len(in) < types::I64_MAX: size,
		"bytes::tokenize: input length exceeds I64_MAX");
	if (len(in) == 0) {
		delim = [];
	};

	return tokenizer {
		in = in,
		delim = delim,
		p = types::I64_MAX, // I64_MAX means we haven't peeked the next token yet.
	};
};

// Like [[tokenize]], but tokenizes the slice in reverse, such that the first
// call to [[next_token]] returns the last token and the last call returns the
// first token.
export fn rtokenize(in: []u8, delim: u8...) tokenizer = {
	assert(len(delim) > 0, "bytes::rtokenize called with empty slice");
	assert(len(in) < types::I64_MAX: size,
		"bytes::rtokenize: input length exceeds I64_MAX");
	if (len(in) == 0) {
		delim = [];
	};

	return tokenizer {
		in = in,
		delim = delim,
		// I64_MIN means we haven't peeked the next token yet. Note that
		// p == -1 corresponds to an index of len(s), and
		// p == -(1 + len(s)) corresponds to an index of 0.
		p = types::I64_MIN,
	};
};

// Returns the next token from a [[tokenizer]] and advances the cursor.
export fn next_token(s: *tokenizer) ([]u8 | done) = {
	const b = match (peek_token(s)) {
	case let b: []u8 =>
		yield b;
	case done =>
		return done;
	};

	const slen = len(s.in): i64;
	const reverse = s.p < 0;
	if (reverse) {
		if (slen + s.p + 1 == 0) {
			s.delim = s.delim[..0];
			s.in = s.in[..0];
		} else {
			const end = (slen + s.p + 1): size - 1;
			s.in = s.in[..end];
		};
		s.p = types::I64_MIN;
	} else {
		if (s.p == slen) {
			s.delim = s.delim[..0];
			s.in = s.in[..0];
		} else {
			s.in = s.in[s.p: size + 1..];
		};
		s.p = types::I64_MAX;
	};

	return b;
};
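
// A minimal usage sketch (illustrative, not part of the module): draining a
// forward tokenizer with [[next_token]]. The expected tokens mirror the
// "simple case" test below.
//
// 	let tok = tokenize([1, 2, 0, 3, 4], 0);
// 	for (true) {
// 		match (next_token(&tok)) {
// 		case let t: []u8 =>
// 			void; // yields [1, 2], then [3, 4]
// 		case done =>
// 			break;
// 		};
// 	};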

// Returns the next token from a [[tokenizer]] without advancing the cursor.
export fn peek_token(s: *tokenizer) ([]u8 | done) = {
	if (len(s.delim) == 0) {
		return done;
	};

	const reverse = s.p < 0;
	const ifunc = if (reverse) &rindex else &index;

	const known = ((reverse && s.p != types::I64_MIN) ||
		(!reverse && s.p != types::I64_MAX));
	if (!known) {
		let i = if (reverse) types::I64_MIN else types::I64_MAX;
		let dlen = 0i64;
		const slen = len(s.in): i64;

		for (let d .. s.delim) {
			match (ifunc(s.in, d)) {
			case let ix: size =>
				if (!reverse && ix: i64 < i) {
					i = ix: i64;
					dlen = 1;
				} else if (reverse && ix: i64 > i) {
					i = ix: i64;
					dlen = 1;
				};
			case void =>
				if (!reverse && slen < i: i64) {
					i = slen;
				} else if (reverse && 0 > i: i64) {
					i = 0;
				};
			};
		};

		if (reverse) {
			if (i == slen) {
				s.p = -(slen + 1);
			} else {
				s.p = i + dlen - slen - 1;
			};
		} else {
			s.p = i;
		};
	};

	if (reverse) {
		return s.in[len(s.in) + s.p: size + 1..];
	} else {
		return s.in[..s.p: size];
	};
};

// Returns the remainder of the input slice from a [[tokenizer]] ahead of the
// token cursor.
export fn remaining_tokens(s: *tokenizer) []u8 = {
	return s.in;
};

fn tokenize_test(
	func: *fn([]u8, u8...) tokenizer,
	testcase: str,
	in: []u8,
	delim: []u8,
	tokens: [][]u8,
	iters: size = types::SIZE_MAX,
) tokenizer = {
	const tok = func(in, delim...);
	let n = 0z;
	for (const want .. tokens) {
		if (n >= iters) {
			return tok;
		};
		n += 1;

		const p = peek_token(&tok) as []u8;
		const n = next_token(&tok) as []u8;
		assert(equal(p, n), testcase);
		assert(equal(n, want), testcase);
	};

	if (n >= iters) {
		return tok;
	};

	assert(peek_token(&tok) is done, testcase);
	assert(next_token(&tok) is done, testcase);
	return tok;
};

@test fn tokenize() void = {
	tokenize_test(&tokenize, "simple case", [1, 2, 0, 3, 4], [0], [
		[1, 2],
		[3, 4],
	]);

	tokenize_test(&tokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
		[1, 2],
		[3, 4],
		[5, 6],
	]);

	tokenize_test(&tokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
		[1, 2],
		[],
		[],
		[3, 4],
	]);

	tokenize_test(&tokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
		[],
		[1, 2, 3],
		[],
	]);

	const tok = tokenize_test(&tokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
		[1, 2],
	], 1);
	assert(equal(remaining_tokens(&tok), [3, 4]));
};

@test fn rtokenize() void = {
	tokenize_test(&rtokenize, "simple case", [1, 2, 0, 3, 4], [0], [
		[3, 4],
		[1, 2],
	]);

	tokenize_test(&rtokenize, "multiple delimiters", [1, 2, 0, 3, 4, 42, 5, 6], [0, 42], [
		[5, 6],
		[3, 4],
		[1, 2],
	]);

	tokenize_test(&rtokenize, "empty tokens", [1, 2, 0, 0, 0, 3, 4], [0], [
		[3, 4],
		[],
		[],
		[1, 2],
	]);

	tokenize_test(&rtokenize, "leading empty tokens", [0, 1, 2, 3, 0], [0], [
		[],
		[1, 2, 3],
		[],
	]);

	const tok = tokenize_test(&rtokenize, "remaining_tokens", [1, 2, 0, 3, 4], [0], [
		[3, 4],
	], 1);
	assert(equal(remaining_tokens(&tok), [1, 2]));
};
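
// A sketch of the cursor semantics (illustrative only): [[peek_token]] caches
// the next token without consuming it, so a peek followed by [[next_token]]
// yields the same slice, and [[remaining_tokens]] returns whatever input the
// cursor has not yet consumed. The values mirror the tests above.
//
// 	let tok = tokenize([1, 2, 0, 3, 4], 0);
// 	assert(equal(peek_token(&tok) as []u8, [1, 2]));
// 	assert(equal(next_token(&tok) as []u8, [1, 2]));
// 	assert(equal(remaining_tokens(&tok), [3, 4]));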

// Returns the input slice "cut" along the first instance of a delimiter,
// returning everything up to the delimiter, and everything after the delimiter,
// in a tuple. The contents are borrowed from the input slice.
//
// The caller must ensure that 'delimiter' is not an empty slice.
export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
	let ln = if (delim is u8) {
		yield 1z;
	} else {
		let ln = len(delim: []u8);
		assert(ln > 0, "bytes::cut called with empty delimiter");
		yield ln;
	};
	match (index(in, delim)) {
	case let i: size =>
		return (in[..i], in[i + ln..]);
	case void =>
		return (in, []);
	};
};

// Returns the input slice "cut" along the last instance of a delimiter,
// returning everything up to the delimiter, and everything after the delimiter,
// in a tuple. The contents are borrowed from the input slice.
//
// The caller must ensure that 'delimiter' is not an empty slice.
export fn rcut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = {
	let ln = if (delim is u8) {
		yield 1z;
	} else {
		let ln = len(delim: []u8);
		assert(ln > 0, "bytes::rcut called with empty delimiter");
		yield ln;
	};
	match (rindex(in, delim)) {
	case let i: size =>
		return (in[..i], in[i + ln..]);
	case void =>
		return (in, []);
	};
};

@test fn cut() void = {
	const c = cut(['a', 'b', 'c'], ['b']);
	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
	const c = cut(['a', 'b', 'c'], 'b');
	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
	const c = cut(['a', 'b', 'c', 'b', 'a'], 'b');
	assert(equal(c.0, ['a']) && equal(c.1, ['c', 'b', 'a']));
	const c = cut(['a', 'b', 'c'], 'x');
	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, []));
	const c = cut([], 'x');
	assert(equal(c.0, []) && equal(c.1, []));

	const c = rcut(['a', 'b', 'c'], ['b']);
	assert(equal(c.0, ['a']) && equal(c.1, ['c']));
	const c = rcut(['a', 'b', 'c', 'b', 'a'], 'b');
	assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, ['a']));
};
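
// A usage sketch for [[cut]] and [[rcut]] (illustrative only): splitting the
// same input on the first versus the last occurrence of a delimiter.
//
// 	const c = cut(['k', '=', 'v', '=', 'w'], '=');
// 	// c.0 is ['k'], c.1 is ['v', '=', 'w']
// 	const r = rcut(['k', '=', 'v', '=', 'w'], '=');
// 	// r.0 is ['k', '=', 'v'], r.1 is ['w']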