tokenize.ha (7905B)
1 // SPDX-License-Identifier: MPL-2.0 2 // (c) Hare authors <https://harelang.org> 3 4 use types; 5 6 export type tokenizer = struct { 7 s: []u8, // string being tokenized 8 d: []u8, // delimiter 9 p: i64, // p < 0 for reverse tokenizers, 0 <= p for forward ones. 10 }; 11 12 // Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting 13 // at the beginning of the slice. The caller must ensure that 'delim' is not an 14 // empty slice. Can tokenize a slice of length less than [[types::I64_MAX]]. 15 export fn tokenize(s: []u8, delim: []u8) tokenizer = { 16 assert(len(delim) > 0, "bytes::tokenize called with empty slice"); 17 if (len(s) == 0) { 18 delim = []; 19 }; 20 return tokenizer { 21 s = s, 22 d = delim, 23 p = types::I64_MAX, // I64_MAX means we haven't peeked the next token yet. 24 }; 25 }; 26 27 // Returns a tokenizer which yields sub-slices tokenized by a delimiter, starting at 28 // the end of the slice and moving backwards with each call to [[next_token]]. The 29 // caller must ensure that 'delimiter' is not an empty slice. Can tokenize a slice 30 // of length less than [[types::I64_MAX]]. 31 export fn rtokenize(s: []u8, delim: []u8) tokenizer = { 32 assert(len(delim) > 0, "bytes::rtokenize called with empty slice"); 33 if (len(s) == 0) { 34 delim = []; 35 }; 36 return tokenizer { 37 s = s, 38 d = delim, 39 p = types::I64_MIN, // I64_MIN means we haven't peeked the next token yet. 40 // also note that p == -1 corresponds to an index of len(s), 41 // and p == -(1 - len(s)) corresponds to an index of 0. 42 }; 43 }; 44 45 // Returns the next slice from a tokenizer, and advances the cursor. Returns 46 // void if there are no tokens left and on all subsequent invocations. If a 47 // string starts with, or ends with, a token, an empty slice is returned at the 48 // beginning or end of the sequence, respectively. 49 export fn next_token(s: *tokenizer) ([]u8 | void) = { 50 const b = match (peek_token(s)) { 51 case let b: []u8 => 52 yield b; 53 case => return; 54 }; 55 56 if (s.p < 0) { // reverse 57 if (len(s.s): i64 + s.p + 1 == 0) { 58 s.d = s.d[..0]; 59 s.s = s.s[..0]; 60 } else { 61 const end = (len(s.s): i64 + s.p + 1): size - len(s.d); 62 s.s = s.s[..end]; 63 }; 64 s.p = types::I64_MIN; 65 } else { 66 if (s.p == len(s.s): i64) { 67 s.d = s.d[..0]; 68 s.s = s.s[..0]; 69 } else { 70 s.s = s.s[s.p: size + len(s.d)..]; 71 }; 72 s.p = types::I64_MAX; 73 }; 74 75 return b; 76 }; 77 78 // Same as [[next_token]], but does not advance the cursor 79 export fn peek_token(s: *tokenizer) ([]u8 | void) = { 80 if (len(s.d) == 0) { 81 return; 82 }; 83 84 const reverse = s.p < 0; 85 const ifunc = if (reverse) &rindex else &index; 86 87 const known = ((s.p < 0 && s.p != types::I64_MIN) || 88 (s.p >= 0 && s.p != types::I64_MAX)); 89 if (!known) { 90 let i = 0i64; 91 let dlen = 0i64; 92 let slen = len(s.s): i64; 93 94 match (ifunc(s.s, s.d)) { 95 case let ix: size => 96 dlen = len(s.d): i64; 97 i = ix: i64; 98 case void => 99 i = slen; 100 }; 101 102 if (reverse) { 103 if (i == slen) { 104 s.p = -(slen + 1); 105 } else { 106 s.p = i + dlen - slen - 1; 107 }; 108 } else { 109 s.p = i; 110 }; 111 }; 112 113 if (reverse) { 114 return s.s[len(s.s) + s.p: size + 1..]; 115 } else { 116 return s.s[..s.p: size]; 117 }; 118 }; 119 120 121 // Returns the remainder of the slice associated with a tokenizer, without doing 122 // any further tokenization. 123 export fn remaining_tokens(s: *tokenizer) []u8 = { 124 return s.s; 125 }; 126 127 @test fn tokenize() void = { 128 const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5]; 129 let t = tokenize(input, [24, 42]); 130 let p = peek_token(&t) as []u8; 131 let n = next_token(&t) as []u8; 132 assert(equal(p, n)); 133 assert(equal([1, 2], n)); 134 let p = peek_token(&t) as []u8; 135 let n = next_token(&t) as []u8; 136 assert(equal(p, n)); 137 assert(equal([3, 24], n)); 138 assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8)); 139 assert(equal([4, 5], next_token(&t) as []u8)); 140 assert(peek_token(&t) is void); 141 assert(next_token(&t) is void); 142 143 const input: [_]u8 = [24, 42, 1, 24, 42]; 144 t = tokenize(input, [24, 42]); 145 assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8)); 146 assert(equal([], next_token(&t) as []u8)); 147 assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8)); 148 assert(equal([1], next_token(&t) as []u8)); 149 assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8)); 150 assert(equal([], next_token(&t) as []u8)); 151 assert(peek_token(&t) is void); 152 assert(next_token(&t) is void); 153 154 const input: [_]u8 = [1, 1, 1, 2, 1, 1, 2, 2]; 155 t = tokenize(input, [1, 2]); 156 assert(equal([1, 1], next_token(&t) as []u8)); 157 assert(equal([1], next_token(&t) as []u8)); 158 assert(equal([2], next_token(&t) as []u8)); 159 assert(next_token(&t) is void); 160 161 const input: [_]u8 = [1, 2]; 162 t = tokenize(input, [1, 2]); 163 assert(equal([], next_token(&t) as []u8)); 164 assert(equal([], next_token(&t) as []u8)); 165 assert(peek_token(&t) is void); 166 assert(next_token(&t) is void); 167 168 const input: [_]u8 = [24, 42, 1, 24, 42, 2, 3, 4]; 169 t = tokenize(input, [24, 42]); 170 assert(equal([], next_token(&t) as []u8)); 171 assert(equal([1], next_token(&t) as []u8)); 172 assert(equal(remaining_tokens(&t), [2, 3, 4])); 173 assert(equal(peek_token(&t) as []u8, [2, 3, 4])); 174 assert(equal(remaining_tokens(&t), [2, 3, 4])); 175 176 t = tokenize([], [42]); 177 assert(peek_token(&t) is void); 178 assert(next_token(&t) is void); 179 180 const input: [_]u8 = [1, 2, 24, 42, 3, 24, 24, 42, 4, 5]; 181 let t = rtokenize(input, [24, 42]); 182 let p = peek_token(&t) as []u8; 183 let n = next_token(&t) as []u8; 184 assert(equal(p, n)); 185 assert(equal([4, 5], n)); 186 let p = peek_token(&t) as []u8; 187 let n = next_token(&t) as []u8; 188 assert(equal(p, n)); 189 assert(equal([3, 24], n)); 190 assert(equal(peek_token(&t) as []u8, peek_token(&t) as []u8)); 191 assert(equal([1, 2], next_token(&t) as []u8)); 192 assert(peek_token(&t) is void); 193 assert(next_token(&t) is void); 194 195 const input: [_]u8 = [1, 2, 3, 24, 42, 4, 24, 42]; 196 t = rtokenize(input, [24, 42]); 197 assert(equal([], next_token(&t) as []u8)); 198 assert(equal([4], next_token(&t) as []u8)); 199 assert(equal(remaining_tokens(&t), [1, 2, 3])); 200 assert(equal(peek_token(&t) as []u8, [1, 2, 3])); 201 assert(equal(remaining_tokens(&t), [1, 2, 3])); 202 }; 203 204 // Returns the input slice "cut" along the first instance of a delimiter, 205 // returning everything up to the delimiter, and everything after the delimiter, 206 // in a tuple. The contents are borrowed from the input slice. 207 // 208 // The caller must ensure that 'delimiter' is not an empty slice. 209 export fn cut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = { 210 let ln = if (delim is u8) { 211 yield 1z; 212 } else { 213 let ln = len(delim: []u8); 214 assert(ln > 0, "bytes::cut called with empty delimiter"); 215 yield ln; 216 }; 217 match (index(in, delim)) { 218 case let i: size => 219 return (in[..i], in[i + ln..]); 220 case void => 221 return (in, []); 222 }; 223 }; 224 225 // Returns the input slice "cut" along the last instance of a delimiter, 226 // returning everything up to the delimiter, and everything after the delimiter, 227 // in a tuple. The contents are borrowed from the input slice. 228 // 229 // The caller must ensure that 'delimiter' is not an empty slice. 230 export fn rcut(in: []u8, delim: ([]u8 | u8)) ([]u8, []u8) = { 231 let ln = if (delim is u8) { 232 yield 1z; 233 } else { 234 let ln = len(delim: []u8); 235 assert(ln > 0, "bytes::rcut called with empty delimiter"); 236 yield ln; 237 }; 238 match (rindex(in, delim)) { 239 case let i: size => 240 return (in[..i], in[i + ln..]); 241 case void => 242 return (in, []); 243 }; 244 }; 245 246 @test fn cut() void = { 247 const c = cut(['a', 'b', 'c'], ['b']); 248 assert(equal(c.0, ['a']) && equal(c.1, ['c'])); 249 const c = cut(['a', 'b', 'c'], 'b'); 250 assert(equal(c.0, ['a']) && equal(c.1, ['c'])); 251 const c = cut(['a', 'b', 'c', 'b', 'a'], 'b'); 252 assert(equal(c.0, ['a']) && equal(c.1, ['c', 'b', 'a'])); 253 const c = cut(['a', 'b', 'c'], 'x'); 254 assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, [])); 255 const c = cut([], 'x'); 256 assert(equal(c.0, []) && equal(c.1, [])); 257 258 const c = rcut(['a', 'b', 'c'], ['b']); 259 assert(equal(c.0, ['a']) && equal(c.1, ['c'])); 260 const c = rcut(['a', 'b', 'c', 'b', 'a'], 'b'); 261 assert(equal(c.0, ['a', 'b', 'c']) && equal(c.1, ['a'])); 262 };