hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

scanner.ha (9145B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use bytes;
      5 use encoding::utf8;
      6 use errors;
      7 use io;
      8 use strings;
      9 use types;
     10 
// Initial size of an allocated scanner read buffer, and the step by which
// scan_readahead grows it, in bytes.
def BUFSZ: size = 4096;
     12 
// I/O vtable shared by all scanners: reads on the stream are served by
// scan_read; every other operation is left at its default.
const scanner_vtable = io::vtable {
	reader = &scan_read,
	...
};
     17 
// State of a buffered scanner. The first field is an [[io::stream]], set to
// the scanner vtable by the constructors, so reads on the stream go through
// the scanner's buffer.
export type scanner = struct {
	stream: io::stream,
	// Underlying handle which data is read from
	src: io::handle,
	// Read buffer; valid data occupies buffer[..pending]
	buffer: []u8,
	// Number of bytes available in buffer. This count still includes the
	// "readout" bytes until scan_shift discards them.
	pending: size,
	// Number of bytes returned to the user
	readout: size,
	// User-confirmed maximum size of read buffer
	maxread: size,
};
     29 
     30 // Creates a new [[scanner]] which will allocate and maintain a read buffer for
     31 // efficient reading of files. The scanner will read ahead only up to maxread
     32 // bytes, which defaults to [[types::SIZE_MAX]] if no limit is required. The
     33 // user must free resources associated with the scanner using [[finish]] after
     34 // use.
     35 //
     36 // Reads from the scanner will return [[errors::overflow]] if maxread is
     37 // reached.
     38 export fn newscanner(
     39 	src: io::handle,
     40 	maxread: size = types::SIZE_MAX,
     41 ) scanner = {
     42 	return scanner {
     43 		stream = &scanner_vtable,
     44 		src = src,
     45 		buffer = alloc([0...], BUFSZ),
     46 		maxread = maxread,
     47 		pending = 0,
     48 		readout = 0,
     49 	};
     50 };
     51 
     52 // Creates a new [[scanner]] using a user-provided buffer. The scanner will
     53 // return [[errors::overflow]] if the buffer length is reached, but will not
     54 // perform any allocations. The user should not call [[finish]] after use unless
     55 // they wish to free the underlying buffer through bufio.
     56 export fn newscanner_static(src: io::handle, buffer: []u8) scanner = {
     57 	return scanner {
     58 		stream = &scanner_vtable,
     59 		src = src,
     60 		buffer = buffer,
     61 		maxread = len(buffer),
     62 		pending = 0,
     63 		readout = 0,
     64 	};
     65 };
     66 
// Frees resources associated with a [[scanner]], i.e. its read buffer. Does
// not close the underlying I/O handle.
export fn finish(scan: *scanner) void = {
	free(scan.buffer);
};
     72 
     73 fn scan_read(s: *io::stream, buf: []u8) (size | io::EOF | io::error) = {
     74 	let scan = s: *scanner;
     75 
     76 	// Consume previous read, if any
     77 	scan_shift(scan);
     78 
     79 	if (scan.pending == 0) {
     80 		match (scan_readahead(scan)?) {
     81 		case io::EOF =>
     82 			return io::EOF;
     83 		case size => void;
     84 		};
     85 	};
     86 
     87 	const n = if (len(buf) > scan.pending) scan.pending else len(buf);
     88 	buf[..n] = scan_consume(scan, n)[..];
     89 	return n;
     90 };
     91 
     92 // Fills up the scanner buffer with data from the underlying I/O handle. If no
     93 // space remains in the read buffer, it is expanded by BUFSZ (up to maxread).
     94 // Then, one read from the underlying I/O handle is performed and scan.pending
     95 // is updated accordingly. Returns the number of bytes which had been available
     96 // prior to the call.
     97 fn scan_readahead(scan: *scanner) (size | io::EOF | io::error) = {
     98 	if (scan.pending >= len(scan.buffer)) {
     99 		let readahead = scan.pending + BUFSZ;
    100 		if (readahead > scan.maxread) {
    101 			readahead = scan.maxread;
    102 		};
    103 		if (scan.pending >= readahead) {
    104 			return errors::overflow;
    105 		};
    106 		append(scan.buffer, [0...], readahead);
    107 	};
    108 
    109 	const prev = scan.pending;
    110 	match (io::read(scan.src, scan.buffer[scan.pending..])?) {
    111 	case let z: size =>
    112 		scan.pending += z;
    113 		return prev;
    114 	case io::EOF =>
    115 		return io::EOF;
    116 	};
    117 };
    118 
    119 // Shifts the buffer towards the start, discarding bytes which were read out.
    120 fn scan_shift(scan: *scanner) void = {
    121 	const n = scan.readout;
    122 	if (n == 0) {
    123 		return;
    124 	};
    125 	scan.buffer[..len(scan.buffer) - n] = scan.buffer[n..];
    126 	scan.readout = 0;
    127 	scan.pending -= n;
    128 };
    129 
    130 // Consumes N bytes from the buffer, updating scan.readout. User must call
    131 // [[scan_shift]] before calling scan_consume again.
    132 fn scan_consume(scan: *scanner, n: size) []u8 = {
    133 	assert(len(scan.buffer) >= n && scan.readout == 0);
    134 	scan.readout = n;
    135 	return scan.buffer[..n];
    136 };
    137 
    138 // Reads one byte from a [[scanner]].
    139 export fn scan_byte(scan: *scanner) (u8 | io::EOF | io::error) = {
    140 	// Consume previous read, if any
    141 	scan_shift(scan);
    142 
    143 	if (scan.pending == 0) {
    144 		match (scan_readahead(scan)?) {
    145 		case io::EOF =>
    146 			return io::EOF;
    147 		case size => void;
    148 		};
    149 	};
    150 
    151 	return scan_consume(scan, 1)[0];
    152 };
    153 
    154 // Reads the next token from a [[scanner]], delimited by delim. The delimiter is
    155 // read from the source handle but not included in the returned slice. The
    156 // return value is borrowed from the internal scanner buffer, which is
    157 // invalidated during subsequent operations which use this scanner.
    158 export fn scan_bytes(
    159 	scan: *scanner,
    160 	delim: (u8 | []u8),
    161 ) ([]u8 | io::EOF | io::error) = {
    162 	scan_shift(scan);
    163 
    164 	let i = 0z, nread = 0z;
    165 	for (true) {
    166 		match (bytes::index(scan.buffer[nread..scan.pending], delim)) {
    167 		case let ix: size =>
    168 			i = ix;
    169 			break;
    170 		case void => void;
    171 		};
    172 
    173 		match (scan_readahead(scan)?) {
    174 		case io::EOF =>
    175 			if (scan.pending == 0) {
    176 				return io::EOF;
    177 			};
    178 			return scan_consume(scan, scan.pending);
    179 		case let prevpending: size =>
    180 			// No need to re-index the earlier part of the buffer
    181 			nread = prevpending;
    182 		};
    183 	};
    184 
    185 	const ndelim = match (delim) {
    186 	case u8 =>
    187 		yield 1z;
    188 	case let u: []u8 =>
    189 		yield len(u);
    190 	};
    191 	const nuser = nread + i, nconsume = nuser + ndelim;
    192 	return scan_consume(scan, nconsume)[..nuser];
    193 };
    194 
    195 // Reads one rune from a [[scanner]].
    196 export fn scan_rune(
    197 	scan: *scanner,
    198 ) (rune | io::EOF | io::error | utf8::invalid) = {
    199 	// Consume previous read, if any
    200 	scan_shift(scan);
    201 
    202 	if (scan.pending == 0) {
    203 		match (scan_readahead(scan)?) {
    204 		case io::EOF =>
    205 			if (scan.pending == 0) {
    206 				return io::EOF;
    207 			};
    208 		case size => void;
    209 		};
    210 	};
    211 	const sz = utf8::utf8sz(scan.buffer[0])?;
    212 
    213 	for (scan.pending < sz) {
    214 		match (scan_readahead(scan)?) {
    215 		case io::EOF =>
    216 			return utf8::invalid;
    217 		case size => void;
    218 		};
    219 	};
    220 
    221 	const buf = scan_consume(scan, sz);
    222 	const dec = utf8::decode(buf[..sz]);
    223 	match (utf8::next(&dec)?) {
    224 	case let r: rune =>
    225 		return r;
    226 	case void =>
    227 		return io::EOF;
    228 	case utf8::more =>
    229 		return utf8::invalid;
    230 	};
    231 };
    232 
    233 // Scans a string of text from a [[scanner]] up to some delimiter. The delimiter
    234 // is read from the source handle but not included in the returned string. The
    235 // return value is borrowed from the internal scanner buffer, which is
    236 // invalidated during subsequent operations which use this scanner.
    237 export fn scan_string(
    238 	scan: *scanner,
    239 	delim: str,
    240 ) (const str | io::EOF | io::error | utf8::invalid) = {
    241 	const token = match (scan_bytes(scan, strings::toutf8(delim))?) {
    242 	case let token: []u8 =>
    243 		yield token;
    244 	case io::EOF =>
    245 		return io::EOF;
    246 	};
    247 	return strings::fromutf8(token)?;
    248 };
    249 
    250 // Scans the next line of text from a [[scanner]]. The return value is borrowed
    251 // from the internal scanner buffer, which is invalidated during subsequent
    252 // operations which use this scanner.
    253 export fn scan_line(
    254 	scan: *scanner,
    255 ) (const str | io::EOF | io::error | utf8::invalid) = {
    256 	return scan_string(scan, "\n");
    257 };
    258 
    259 // Returns the internal scanner buffer, which contains all bytes read ahead by
    260 // the scanner up to this point.
    261 export fn scan_buffer(scan: *scanner) []u8 = {
    262 	scan_shift(scan);
    263 	return scan.buffer[..scan.pending];
    264 };
    265 
    266 fn scan_unread(scan: *scanner, buf: []u8) void = {
    267 	if (len(buf) == 0) {
    268 		return;
    269 	};
    270 	if (len(buf) <= scan.readout) {
    271 		scan.buffer[scan.readout - len(buf)..scan.readout] = buf;
    272 		scan.readout -= len(buf);
    273 	} else {
    274 		const n = len(buf) - scan.readout;
    275 		assert(n < scan.maxread - scan.pending,
    276 			"Attempted to unread more data than buffer has available");
    277 		scan.buffer[n..] = scan.buffer[..len(scan.buffer) - n];
    278 		scan.pending += n;
    279 		scan.buffer[..len(buf)] = buf;
    280 		scan.readout = 0;
    281 	};
    282 };
    283 
    284 // Reads a single byte from an [[io::handle]].
    285 export fn read_byte(file: io::handle) (u8 | io::EOF | io::error) = {
    286 	let buf: [1]u8 = [0...];
    287 
    288 	match (io::readall(file, buf)?) {
    289 	case size =>
    290 		return buf[0];
    291 	case io::EOF =>
    292 		return io::EOF;
    293 	};
    294 };
    295 
    296 // Reads a slice of bytes until the delimiter. Delimiter is not included but
    297 // it is read from the file. The return value must be freed by the caller.
    298 export fn read_tok(file: io::handle, delim: u8...) ([]u8 | io::EOF | io::error) = {
    299 	let buf: []u8 = [];
    300 
    301 	for (true) {
    302 		match (read_byte(file)?) {
    303 		case let res: u8 =>
    304 			if (bytes::contains(delim, res)) {
    305 				break;
    306 			};
    307 			append(buf, res);
    308 		case io::EOF =>
    309 			if (len(buf) == 0) {
    310 				return io::EOF;
    311 			};
    312 			break;
    313 		};
    314 	};
    315 
    316 	return buf;
    317 };
    318 
    319 // Reads a slice of bytes until a newline character (\n, 0x0A). Newline itself
    320 // is not included but it is read from the file. The return value must be
    321 // freed by the caller.
    322 export fn read_line(file: io::handle) ([]u8 | io::EOF | io::error) =
    323 	read_tok(file, '\n');
    324 
    325 // Reads a rune from a UTF-8 stream.
    326 export fn read_rune(
    327 	file: io::handle,
    328 ) (rune | utf8::invalid | io::EOF | io::error) = {
    329 	let b: [4]u8 = [0...];
    330 	match (io::readall(file, b[..1])?) {
    331 	case let n: size => void;
    332 	case io::EOF =>
    333 		return io::EOF;
    334 	};
    335 
    336 	const sz = utf8::utf8sz(b[0])?;
    337 
    338 	if (sz == 1) {
    339 		return b[0]: rune;
    340 	};
    341 
    342 	match (io::readall(file, b[1..sz])) {
    343 	case let n: size => void;
    344 	case io::EOF =>
    345 		return io::EOF;
    346 	case let err: io::error =>
    347 		return if (err is io::underread) utf8::invalid else err;
    348 	};
    349 
    350 	let dec = utf8::decode(b[..sz]);
    351 	match (utf8::next(&dec)?) {
    352 	case let r: rune =>
    353 		return r;
    354 	case void =>
    355 		return io::EOF;
    356 	case utf8::more =>
    357 		return utf8::invalid;
    358 	};
    359 };