scanner.ha (9145B)
// SPDX-License-Identifier: MPL-2.0
// (c) Hare authors <https://harelang.org>

use bytes;
use encoding::utf8;
use errors;
use io;
use strings;
use types;

// Size of the buffer initially allocated by [[newscanner]], and the increment
// used when computing how much further a scanner may read ahead.
def BUFSZ: size = 4096;

// Allows a [[scanner]] to be used as an [[io::stream]]. Only reading is
// provided.
const scanner_vtable = io::vtable {
	reader = &scan_read,
	...
};

// State for a buffered scanner. Bytes in buffer[..readout] have already been
// handed to the user; bytes in buffer[readout..pending] have been read from src
// but not yet returned.
export type scanner = struct {
	stream: io::stream,
	src: io::handle,
	buffer: []u8,
	// Number of bytes available in buffer
	pending: size,
	// Number of bytes returned to the user
	readout: size,
	// User-confirmed maximum size of read buffer
	maxread: size,
};

// Creates a new [[scanner]] which will allocate and maintain a read buffer for
// efficient reading of files. The scanner will read ahead only up to maxread
// bytes, which defaults to [[types::SIZE_MAX]] if no limit is required. The
// user must free resources associated with the scanner using [[finish]] after
// use.
//
// Reads from the scanner will return [[errors::overflow]] if maxread is
// reached.
export fn newscanner(
	src: io::handle,
	maxread: size = types::SIZE_MAX,
) scanner = {
	// Never start with a buffer larger than the caller's limit; otherwise
	// the very first read could pull in more than maxread bytes.
	const initsz = if (maxread < BUFSZ) maxread else BUFSZ;
	return scanner {
		stream = &scanner_vtable,
		src = src,
		buffer = alloc([0...], initsz),
		maxread = maxread,
		pending = 0,
		readout = 0,
	};
};

// Creates a new [[scanner]] using a user-provided buffer. The scanner will
// return [[errors::overflow]] if the buffer length is reached, but will not
// perform any allocations. The user should not call [[finish]] after use unless
// they wish to free the underlying buffer through bufio.
export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
	return scanner {
		stream = &scanner_vtable,
		src = src,
		buffer = buffer,
		// The limit equals the buffer length, so the buffer is never
		// grown.
		maxread = len(buffer),
		pending = 0,
		readout = 0,
	};
};

// Frees resources associated with a [[scanner]]. Does not close the
// underlying I/O handle.
export fn finish(scan: *scanner) void = {
	free(scan.buffer);
};

// Reader implementation for the scanner's [[io::stream]] interface. Copies up
// to len(buf) buffered bytes into buf, reading ahead from the source once if
// nothing is buffered. Returns the number of bytes copied, or [[io::EOF]] if
// the buffer is empty and the source is exhausted.
fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
	let scan = s: *scanner;

	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
		case size => void;
		};
	};

	// Hand out no more than what is buffered and no more than fits in buf
	const n = if (len(buf) > scan.pending) scan.pending else len(buf);
	buf[..n] = scan_consume(scan, n)[..];
	return n;
};

// Fills up the scanner buffer with data from the underlying I/O handle. If no
// space remains in the read buffer, it is expanded by BUFSZ (up to maxread).
// Then, one read from the underlying I/O handle is performed and scan.pending
// is updated accordingly. Returns the number of bytes which had been available
// prior to the call.
//
// Returns [[errors::overflow]] if the buffer is full and already holds maxread
// bytes.
fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
	if (scan.pending >= len(scan.buffer)) {
		// Target length for the buffer after expansion
		let readahead = scan.pending + BUFSZ;
		if (readahead > scan.maxread) {
			readahead = scan.maxread;
		};
		if (scan.pending >= readahead) {
			return errors::overflow;
		};
		// readahead is the desired total length, so append only the
		// difference. Appending readahead bytes outright would let the
		// buffer (and thus a single read) exceed maxread. The
		// subtraction cannot underflow: readahead > scan.pending and
		// scan.pending >= len(scan.buffer) both hold here.
		append(scan.buffer, [0...], readahead - len(scan.buffer));
	};

	const prev = scan.pending;
	match (io::read(scan.src, scan.buffer[scan.pending..])?) {
	case let z: size =>
		scan.pending += z;
		return prev;
	case io::EOF =>
		return io::EOF;
	};
};

// Shifts the buffer towards the start, discarding bytes which were read out.
// Does nothing if no bytes have been read out since the last shift.
fn scan_shift(scan: *scanner) void = {
	const n = scan.readout;
	if (n == 0) {
		return;
	};
	// Move everything after the read-out prefix down to index 0
	scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
	scan.readout = 0;
	scan.pending -= n;
};

// Consumes N bytes from the buffer, updating scan.readout. User must call
// [[scan_shift]] before calling scan_consume again.
fn scan_consume(scan: *scanner, n: size) []u8 = {
	assert(len(scan.buffer) >= n && scan.readout == 0);
	scan.readout = n;
	return scan.buffer[..n];
};

// Reads one byte from a [[scanner]].
export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return io::EOF;
		case size => void;
		};
	};

	return scan_consume(scan, 1)[0];
};

// Reads the next token from a [[scanner]], delimited by delim. The delimiter is
// read from the source handle but not included in the returned slice. The
// return value is borrowed from the internal scanner buffer, which is
// invalidated during subsequent operations which use this scanner.
//
// If the source is exhausted before a delimiter is found, the remaining
// buffered bytes are returned as the final token; [[io::EOF]] is returned only
// when nothing is buffered.
export fn scan_bytes(
	scan: *scanner,
	delim: (u8 | []u8),
) ([]u8 | io::EOF | io::error) = {
	scan_shift(scan);

	const ndelim = match (delim) {
	case u8 =>
		yield 1z;
	case let u: []u8 =>
		yield len(u);
	};
	// A multi-byte delimiter may be split across two reads: up to
	// ndelim - 1 of its bytes can sit at the end of the data which has
	// already been searched. Resume each search that many bytes early so
	// such a delimiter is still found.
	const overlap = if (ndelim > 0) ndelim - 1 else 0z;

	let i = 0z, nread = 0z;
	for (true) {
		match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
		case let ix: size =>
			i = ix;
			break;
		case void => void;
		};

		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.pending == 0) {
				return io::EOF;
			};
			return scan_consume(scan, scan.pending);
		case let prevpending: size =>
			// No need to re-index the earlier part of the buffer,
			// apart from the tail which could hold the start of a
			// delimiter.
			nread = if (prevpending > overlap) {
				yield prevpending - overlap;
			} else {
				yield 0z;
			};
		};
	};

	// i is relative to nread; the delimiter is consumed but not returned
	const nuser = nread + i, nconsume = nuser + ndelim;
	return scan_consume(scan, nconsume)[..nuser];
};

// Reads one rune from a [[scanner]]. Returns [[utf8::invalid]] if the source
// ends partway through a multi-byte sequence.
export fn scan_rune(
	scan: *scanner,
) (rune | io::EOF | io::error | utf8::invalid) = {
	// Consume previous read, if any
	scan_shift(scan);

	if (scan.pending == 0) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			if (scan.pending == 0) {
				return io::EOF;
			};
		case size => void;
		};
	};
	// The lead byte determines how many bytes the rune occupies
	const sz = utf8::utf8sz(scan.buffer[0])?;

	// Keep reading until the whole sequence is buffered
	for (scan.pending < sz) {
		match (scan_readahead(scan)?) {
		case io::EOF =>
			return utf8::invalid;
		case size => void;
		};
	};

	const buf = scan_consume(scan, sz);
	const dec = utf8::decode(buf[..sz]);
	match (utf8::next(&dec)?) {
	case let r: rune =>
		return r;
	case void =>
		return io::EOF;
	case utf8::more =>
		return utf8::invalid;
	};
};

// Scans a string of text from a [[scanner]] up to some delimiter. The delimiter
// is read from the source handle but not included in the returned string. The
// return value is borrowed from the internal scanner buffer, which is
// invalidated during subsequent operations which use this scanner.
export fn scan_string(
	scan: *scanner,
	delim: str,
) (const str | io::EOF | io::error | utf8::invalid) = {
	const token = match (scan_bytes(scan, strings::toutf8(delim))?) {
	case let token: []u8 =>
		yield token;
	case io::EOF =>
		return io::EOF;
	};
	return strings::fromutf8(token)?;
};

// Scans the next line of text from a [[scanner]]. The return value is borrowed
// from the internal scanner buffer, which is invalidated during subsequent
// operations which use this scanner.
export fn scan_line(
	scan: *scanner,
) (const str | io::EOF | io::error | utf8::invalid) = {
	return scan_string(scan, "\n");
};

// Returns the internal scanner buffer, which contains all bytes read ahead by
// the scanner up to this point.
export fn scan_buffer(scan: *scanner) []u8 = {
	scan_shift(scan);
	return scan.buffer[..scan.pending];
};

// Places buf back at the front of the scanner's unread data. If buf fits in
// the region which was most recently read out, it is written there; otherwise
// the pending data is moved towards the end of the buffer to make room. Aborts
// if the buffer does not have enough free space.
fn scan_unread(scan: *scanner, buf: []u8) void = {
	if (len(buf) == 0) {
		return;
	};
	if (len(buf) <= scan.readout) {
		scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
		scan.readout -= len(buf);
	} else {
		// Number of extra bytes needed beyond the read-out region
		const n = len(buf) - scan.readout;
		// The shift below discards the last n bytes of the buffer, so
		// the free space must be measured against the buffer itself.
		// maxread may be far larger than the buffer and would let
		// pending data be lost silently.
		assert(n <= len(scan.buffer) - scan.pending,
			"Attempted to unread more data than buffer has available");
		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
		scan.pending += n;
		scan.buffer[..len(buf)] = buf;
		scan.readout = 0;
	};
};

// Reads a single byte from an [[io::handle]].
export fn read_byte(file: io::handle) (u8 | io::EOF | io::error) = {
	let buf: [1]u8 = [0...];

	match (io::readall(file, buf)?) {
	case size =>
		return buf[0];
	case io::EOF =>
		return io::EOF;
	};
};

// Reads a slice of bytes until the delimiter. Delimiter is not included but
// it is read from the file. The return value must be freed by the caller.
//
// If the file ends before a delimiter is seen, the bytes read so far are
// returned; [[io::EOF]] is returned only if no bytes were read. Nothing is
// allocated on return if an error occurs.
export fn read_tok(file: io::handle, delim: u8...) ([]u8 | io::EOF | io::error) = {
	let buf: []u8 = [];

	for (true) {
		match (read_byte(file)) {
		case let res: u8 =>
			if (bytes::contains(delim, res)) {
				break;
			};
			append(buf, res);
		case io::EOF =>
			if (len(buf) == 0) {
				return io::EOF;
			};
			break;
		case let err: io::error =>
			// The caller never sees the partial token, so release
			// it here rather than leaking it.
			free(buf);
			return err;
		};
	};

	return buf;
};

// Reads a slice of bytes until a newline character (\n, 0x0A). Newline itself
// is not included but it is read from the file. The return value must be
// freed by the caller.
export fn read_line(file: io::handle) ([]u8 | io::EOF | io::error) =
	read_tok(file, '\n');

// Reads a rune from a UTF-8 stream.
// Returns [[io::EOF]] only if the stream ends before the first byte of a rune;
// a stream which ends partway through a multi-byte sequence yields
// [[utf8::invalid]].
export fn read_rune(
	file: io::handle,
) (rune | utf8::invalid | io::EOF | io::error) = {
	let b: [4]u8 = [0...];
	match (io::readall(file, b[..1])?) {
	case size => void;
	case io::EOF =>
		return io::EOF;
	};

	// The lead byte determines the length of the encoded rune
	const sz = utf8::utf8sz(b[0])?;

	if (sz == 1) {
		return b[0]: rune;
	};

	match (io::readall(file, b[1..sz])) {
	case size => void;
	case io::EOF =>
		// A lead byte was already consumed, so hitting EOF here means
		// the sequence is truncated. Reporting EOF would silently drop
		// that byte and disagree with the underread case below.
		return utf8::invalid;
	case let err: io::error =>
		return if (err is io::underread) utf8::invalid else err;
	};

	let dec = utf8::decode(b[..sz]);
	match (utf8::next(&dec)?) {
	case let r: rune =>
		return r;
	case void =>
		return io::EOF;
	case utf8::more =>
		return utf8::invalid;
	};
};