hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

lex.ha (19407B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 use ascii;
      5 use bufio;
      6 use encoding::utf8;
      7 use fmt;
      8 use io;
      9 use memio;
     10 use os;
     11 use path;
     12 use sort;
     13 use sort::cmp;
     14 use strconv;
     15 use strings;
     16 use types;
     17 
     18 export type lexer = struct {
     19 	in: *bufio::scanner,
     20 	path: str,
     21 	loc: (uint, uint),
     22 	prevrloc: (uint, uint),
     23 	un: token, // ltok::EOF when no token was unlexed
     24 	prevunlocs: [2]((uint, uint), (uint, uint)),
     25 	flags: flag,
     26 	comment: str,
     27 	require_int: bool,
     28 };
     29 
     30 // Flags which apply to this lexer
     31 export type flag = enum uint {
     32 	NONE = 0,
     33 	// Enables lexing comments
     34 	COMMENTS = 1 << 0,
     35 };
     36 
     37 // A syntax error
     38 export type syntax = !(location, str);
     39 
     40 // All possible lexer errors
     41 export type error = !(io::error | syntax);
     42 
     43 // Returns a human-friendly string for a given error. The result may be
     44 // statically allocated.
     45 export fn strerror(err: error) const str = {
     46 	static let buf: [2048]u8 = [0...];
     47 	match (err) {
     48 	case let err: io::error =>
     49 		return io::strerror(err);
     50 	case let s: syntax =>
     51 		return fmt::bsprintf(buf, "{}:{}:{}: syntax error: {}",
     52 			s.0.path, s.0.line, s.0.col, s.1);
     53 	};
     54 };
     55 
     56 // Initializes a new lexer for the given [[bufio::scanner]]. The path is
     57 // borrowed.
     58 export fn init(
     59 	in: *bufio::scanner,
     60 	path: str,
     61 	flags: flag = flag::NONE,
     62 ) lexer = {
     63 	const loc = location { path = path, line = 1, col = 1 };
     64 	return lexer {
     65 		in = in,
     66 		path = path,
     67 		loc = (1, 1),
     68 		prevrloc = (1, 1),
     69 		un = (ltok::EOF, void, loc),
     70 		prevunlocs = [((1, 1), (1, 1))...],
     71 		flags = flags,
     72 		...
     73 	};
     74 };
     75 
     76 // Returns the current value of the comment buffer, or empty string if unset (or
     77 // if [[flag::COMMENTS]] was not enabled for this lexer).
     78 export fn comment(lex: *lexer) str = lex.comment;
     79 
     80 // Returns the next token from the lexer.
     81 export fn lex(lex: *lexer) (token | error) = {
     82 	if (lex.un.0 != ltok::EOF) {
     83 		defer lex.un.0 = ltok::EOF;
     84 		return lex.un;
     85 	};
     86 
     87 	defer {
     88 		lex.prevunlocs[1] = lex.prevunlocs[0];
     89 		const prev = prevloc(lex);
     90 		const loc = mkloc(lex);
     91 		lex.prevunlocs[0] = (
     92 			(prev.line, prev.col),
     93 			(loc.line, loc.col),
     94 		);
     95 	};
     96 
     97 	let r = match (nextw(lex)?) {
     98 	case io::EOF =>
     99 		return (ltok::EOF, void, mkloc(lex));
    100 	case let r: (rune, location) =>
    101 		yield r;
    102 	};
    103 
    104 	if (ascii::isdigit(r.0)) {
    105 		unget(lex, r.0);
    106 		return lex_literal(lex);
    107 	};
    108 
    109 	lex.require_int = false;
    110 	if (is_name(r.0, false)) {
    111 		unget(lex, r.0);
    112 		return lex_name(lex, r.1);
    113 	};
    114 
    115 	let tok = switch (r.0) {
    116 	case '"', '\'', '`' =>
    117 		unget(lex, r.0);
    118 		return lex_rn_str(lex);
    119 	case '.', '<', '>', '&', '|', '^' =>
    120 		unget(lex, r.0);
    121 		return lex3(lex);
    122 	case '*', '%', '/', '+', '-', ':', '!', '=' =>
    123 		unget(lex, r.0);
    124 		return lex2(lex);
    125 	case '~' =>
    126 		yield ltok::BNOT;
    127 	case ',' =>
    128 		yield ltok::COMMA;
    129 	case '{' =>
    130 		yield ltok::LBRACE;
    131 	case '[' =>
    132 		yield ltok::LBRACKET;
    133 	case '(' =>
    134 		yield ltok::LPAREN;
    135 	case '}' =>
    136 		yield ltok::RBRACE;
    137 	case ']' =>
    138 		yield ltok::RBRACKET;
    139 	case ')' =>
    140 		yield ltok::RPAREN;
    141 	case ';' =>
    142 		yield ltok::SEMICOLON;
    143 	case '?' =>
    144 		yield ltok::QUESTION;
    145 	case =>
    146 		return syntaxerr(r.1, "invalid character");
    147 	};
    148 
    149 	line_comment(lex)?;
    150 	return (tok, void, r.1);
    151 };
    152 
    153 fn is_name(r: rune, num: bool) bool =
    154 	ascii::isalpha(r) || r == '_' || r == '@' || (num && ascii::isdigit(r));
    155 
    156 fn lex_unicode(lex: *lexer, loc: location, n: size) (rune | error) = {
    157 	assert(n < 9);
    158 	let buf: [8]u8 = [0...];
    159 	for (let i = 0z; i < n; i += 1z) {
    160 		let r = match (next(lex)?) {
    161 		case io::EOF =>
    162 			return syntaxerr(loc,
    163 				"unexpected EOF scanning for escape");
    164 		case let r: (rune, location) =>
    165 			yield r.0;
    166 		};
    167 		if (!ascii::isxdigit(r)) {
    168 			return syntaxerr(loc,
    169 				"unexpected rune scanning for escape");
    170 		};
    171 		buf[i] = r: u8;
    172 	};
    173 	let s = strings::fromutf8_unsafe(buf[..n]);
    174 	return strconv::stou32(s, strconv::base::HEX) as u32: rune;
    175 };
    176 
    177 fn lex_rune(lex: *lexer, loc: location) (rune | error) = {
    178 	let r = match (next(lex)?) {
    179 	case io::EOF =>
    180 		return syntaxerr(loc, "unexpected EOF scanning for rune");
    181 	case let r: (rune, location) =>
    182 		yield r.0;
    183 	};
    184 	if (r != '\\') {
    185 		return r;
    186 	};
    187 	r = match (next(lex)?) {
    188 	case io::EOF =>
    189 		return syntaxerr(loc, "unexpected EOF scanning for escape");
    190 	case let r: (rune, location) =>
    191 		yield r.0;
    192 	};
    193 	switch (r) {
    194 	case '\\' =>
    195 		return '\\';
    196 	case '\'' =>
    197 		return '\'';
    198 	case '0' =>
    199 		return '\0';
    200 	case 'a' =>
    201 		return '\a';
    202 	case 'b' =>
    203 		return '\b';
    204 	case 'f' =>
    205 		return '\f';
    206 	case 'n' =>
    207 		return '\n';
    208 	case 'r' =>
    209 		return '\r';
    210 	case 't' =>
    211 		return '\t';
    212 	case 'v' =>
    213 		return '\v';
    214 	case '"' =>
    215 		return '\"';
    216 	case 'x' =>
    217 		return lex_unicode(lex, loc, 2);
    218 	case 'u' =>
    219 		return lex_unicode(lex, loc, 4);
    220 	case 'U' =>
    221 		return lex_unicode(lex, loc, 8);
    222 	case =>
    223 		return syntaxerr(mkloc(lex), "unknown escape sequence");
    224 	};
    225 };
    226 
    227 fn lex_string(lex: *lexer, loc: location, delim: rune) (token | error) = {
    228 	let ret: token = (ltok::LIT_STR, "", loc);
    229 	let buf = memio::dynamic();
    230 	for (true) match (next(lex)?) {
    231 	case io::EOF =>
    232 		return syntaxerr(loc, "unexpected EOF scanning string literal");
    233 	case let r: (rune, location) =>
    234 		if (r.0 == delim) break
    235 		else if (delim == '"' && r.0 == '\\') {
    236 			unget(lex, r.0);
    237 			let r = lex_rune(lex, loc)?;
    238 			memio::appendrune(&buf, r)?;
    239 		} else {
    240 			memio::appendrune(&buf, r.0)?;
    241 		};
    242 	};
    243 	for (true) match (nextw(lex)?) {
    244 	case io::EOF =>
    245 		break;
    246 	case let r: (rune, location) =>
    247 		switch (r.0) {
    248 		case '"', '`' =>
    249 			const tok = lex_string(lex, loc, r.0)?;
    250 			const next = tok.1 as str;
    251 			memio::concat(&buf, next)!;
    252 			free(next);
    253 			break;
    254 		case '/' =>
    255 			match (nextw(lex)?) {
    256 			case io::EOF =>
    257 				unget(lex, r.0);
    258 			case let s: (rune, location) =>
    259 				if (s.0 == '/') {
    260 					lex_comment(lex)?;
    261 					continue;
    262 				} else {
    263 					unget(lex, s.0);
    264 					unget(lex, r.0);
    265 				};
    266 			};
    267 			break;
    268 		case =>
    269 			unget(lex, r.0);
    270 			break;
    271 		};
    272 	};
    273 	return (ltok::LIT_STR, memio::string(&buf)!, loc);
    274 };
    275 
    276 fn lex_rn_str(lex: *lexer) (token | error) = {
    277 	const loc = mkloc(lex);
    278 	let r = match (next(lex)) {
    279 	case let r: (rune, location) =>
    280 		yield r.0;
    281 	case (io::EOF | io::error) =>
    282 		abort();
    283 	};
    284 	switch (r) {
    285 	case '\'' => void;
    286 	case '\"', '`' =>
    287 		return lex_string(lex, loc, r);
    288 	case =>
    289 		abort(); // Invariant
    290 	};
    291 
    292 	// Rune literal
    293 	let ret: token = (ltok::LIT_RCONST, lex_rune(lex, loc)?, loc);
    294 	match (next(lex)?) {
    295 	case io::EOF =>
    296 		return syntaxerr(loc, "unexpected EOF");
    297 	case let n: (rune, location) =>
    298 		if (n.0 != '\'') {
    299 			return syntaxerr(n.1, "expected \"\'\"");
    300 		};
    301 	};
    302 	line_comment(lex)?;
    303 	return ret;
    304 };
    305 
    306 fn lex_name(lex: *lexer, loc: location) (token | error) = {
    307 	let buf = memio::dynamic();
    308 	match (next(lex)) {
    309 	case let r: (rune, location) =>
    310 		assert(is_name(r.0, false));
    311 		memio::appendrune(&buf, r.0)!;
    312 	case (io::EOF | io::error) =>
    313 		abort();
    314 	};
    315 
    316 	for (true) match (next(lex)?) {
    317 	case io::EOF => break;
    318 	case let r: (rune, location) =>
    319 		if (!is_name(r.0, true)) {
    320 			unget(lex, r.0);
    321 			break;
    322 		};
    323 		memio::appendrune(&buf, r.0)?;
    324 	};
    325 
    326 	line_comment(lex)?;
    327 
    328 	let n = memio::string(&buf)!;
    329 
    330 	match (sort::search(bmap[..ltok::LAST_KEYWORD+1],
    331 		size(str), &n, &cmp::strs)) {
    332 	case void =>
    333 		return (ltok::NAME, n, loc);
    334 	case let i: size =>
    335 		free(n);
    336 		return (i: ltok, void, loc);
    337 	};
    338 };
    339 
    340 fn line_comment(lex: *lexer) (void | error) = {
    341 	if (lex.flags & flag::COMMENTS != flag::COMMENTS) {
    342 		return;
    343 	};
    344 
    345 	let r: (rune, location) = ('\0', location { ... });
    346 	for (true) match (try(lex, '\t', ' ', '/')?) {
    347 	case void =>
    348 		return;
    349 	case let v: (rune, location) =>
    350 		switch (v.0) {
    351 		case '\t', ' ' => void;
    352 		case '/' =>
    353 			r = v;
    354 			break;
    355 		case => abort(); // unreachable
    356 		};
    357 	};
    358 
    359 	if (try(lex, '/')? is void) {
    360 		unget(lex, r.0);
    361 		return;
    362 	};
    363 
    364 	free(lex.comment);
    365 	lex.comment = "";
    366 	lex_comment(lex)?;
    367 };
    368 
    369 fn lex_comment(lexr: *lexer) (void | error) = {
    370 	if (lexr.flags & flag::COMMENTS != flag::COMMENTS) {
    371 		for (true) match (next(lexr)?) {
    372 		case io::EOF =>
    373 			break;
    374 		case let r: (rune, location) =>
    375 			if (r.0 == '\n') {
    376 				break;
    377 			};
    378 		};
    379 		return;
    380 	};
    381 
    382 	let buf = memio::dynamic();
    383 	defer io::close(&buf)!;
    384 	for (true) match (next(lexr)?) {
    385 	case io::EOF =>
    386 		break;
    387 	case let r: (rune, location) =>
    388 		memio::appendrune(&buf, r.0)!;
    389 		if (r.0 == '\n') {
    390 			break;
    391 		};
    392 	};
    393 	let bytes = strings::toutf8(lexr.comment);
    394 	append(bytes, strings::toutf8(memio::string(&buf)!)...);
    395 	lexr.comment = strings::fromutf8(bytes)!;
    396 };
    397 
    398 fn lex_literal(lex: *lexer) (token | error) = {
    399 	const loc = mkloc(lex);
    400 	let chars: []u8 = [];
    401 	let r = match (next(lex)?) {
    402 	case io::EOF =>
    403 		return (ltok::EOF, void, loc);
    404 	case let r: (rune, location) =>
    405 		yield r;
    406 	};
    407 
    408 	let started = false;
    409 	let base = strconv::base::DEC;
    410 	if (r.0 == '0') {
    411 		append(chars, utf8::encoderune(r.0)...);
    412 		r = match (next(lex)?) {
    413 		case io::EOF =>
    414 			return (ltok::LIT_ICONST, 0u64, loc);
    415 		case let r: (rune, location) =>
    416 			yield r;
    417 		};
    418 		switch (r.0) {
    419 		case 'b' =>
    420 			base = strconv::base::BIN;
    421 		case 'o' =>
    422 			base = strconv::base::OCT;
    423 		case 'x' =>
    424 			base = strconv::base::HEX;
    425 		case =>
    426 			if (ascii::isdigit(r.0)) {
    427 				return syntaxerr(loc,
    428 					"Leading zeros in number literals aren't permitted (for octal, use the 0o prefix instead)");
    429 			};
    430 			started = true;
    431 			unget(lex, r.0);
    432 		};
    433 	} else unget(lex, r.0);
    434 	let basechrs = switch (base) {
    435 	case strconv::base::BIN =>
    436 		yield "01";
    437 	case strconv::base::OCT =>
    438 		yield "01234567";
    439 	case strconv::base::DEC =>
    440 		yield "0123456789";
    441 	case strconv::base::HEX =>
    442 		yield "0123456789ABCDEFabcdef";
    443 	case => abort(); // unreachable
    444 	};
    445 
    446 	let suff: (size | void) = void;
    447 	let exp: (size | void) = void;
    448 	let end = 0z;
    449 	let float = false;
    450 	for (true) {
    451 		r = match (next(lex)?) {
    452 		case io::EOF =>
    453 			break;
    454 		case let r: (rune, location) =>
    455 			yield r;
    456 		};
    457 		if (!strings::contains(basechrs, r.0)) switch (r.0) {
    458 		case '.' =>
    459 			if (!started) {
    460 				return syntaxerr(loc,
    461 					"Expected integer literal");
    462 			};
    463 			if (float || exp is size || suff is size
    464 					|| lex.require_int) {
    465 				unget(lex, r.0);
    466 				break;
    467 			} else {
    468 				r = match (next(lex)?) {
    469 				case io::EOF =>
    470 					break;
    471 				case let r: (rune, location) =>
    472 					yield r;
    473 				};
    474 				if (!strings::contains(basechrs, r.0)) {
    475 					unget(lex, r.0);
    476 					unget(lex, '.');
    477 					break;
    478 				};
    479 				unget(lex, r.0);
    480 				float = true;
    481 				append(chars, utf8::encoderune('.')...);
    482 			};
    483 		case 'e', 'E', 'p', 'P' =>
    484 			if (!started) {
    485 				return syntaxerr(loc,
    486 					"Expected integer literal");
    487 			};
    488 			if ((r.0 == 'e' || r.0 == 'E') !=
    489 					(base == strconv::base::DEC)) {
    490 				unget(lex, r.0);
    491 				break;
    492 			};
    493 			if (exp is size || suff is size) {
    494 				unget(lex, r.0);
    495 				break;
    496 			} else {
    497 				if (end == 0) end = len(chars);
    498 				append(chars, utf8::encoderune(r.0)...);
    499 				exp = len(chars);
    500 				r = match (next(lex)?) {
    501 				case io::EOF =>
    502 					break;
    503 				case let r: (rune, location) =>
    504 					yield r;
    505 				};
    506 				switch (r.0) {
    507 				case '+', '-' =>
    508 					append(chars, utf8::encoderune(r.0)...);
    509 				case =>
    510 					unget(lex, r.0);
    511 				};
    512 				basechrs = "0123456789";
    513 			};
    514 		case 'i', 'u', 'f', 'z' =>
    515 			if (!started) {
    516 				return syntaxerr(loc,
    517 					"Expected integer literal");
    518 			};
    519 			if (suff is size || r.0 != 'f' && float
    520 					|| r.0 == 'f'
    521 					&& base != strconv::base::DEC) {
    522 				unget(lex, r.0);
    523 				break;
    524 			} else {
    525 				suff = len(chars);
    526 				if (end == 0) end = len(chars);
    527 				append(chars, utf8::encoderune(r.0)...);
    528 				basechrs = "0123456789";
    529 			};
    530 		case =>
    531 			unget(lex, r.0);
    532 			break;
    533 		} else append(chars, utf8::encoderune(r.0)...);
    534 		started = true;
    535 	};
    536 	if (!started) {
    537 		return syntaxerr(loc, "expected integer literal");
    538 	};
    539 	if (end == 0) end = len(chars);
    540 	lex.require_int = false;
    541 
    542 	let exp = match (exp) {
    543 	case void =>
    544 		yield "0";
    545 	case let exp: size =>
    546 		let end = match (suff) {
    547 		case void =>
    548 			yield len(chars);
    549 		case let suff: size =>
    550 			yield suff;
    551 		};
    552 		yield strings::fromutf8(chars[exp..end])!;
    553 	};
    554 	let exp = match (strconv::stoi(exp)) {
    555 	case let exp: int =>
    556 		yield exp;
    557 	case strconv::invalid =>
    558 		return syntaxerr(mkloc(lex), "expected exponent");
    559 	case strconv::overflow =>
    560 		return syntaxerr(loc, "overflow in exponent");
    561 	};
    562 
    563 	let floatend = match (suff) {
    564 	case let suff: size =>
    565 		yield suff;
    566 	case void =>
    567 		yield len(chars);
    568 	};
    569 	let suff = match (suff) {
    570 	case let suff: size =>
    571 		yield strings::fromutf8(chars[suff..])!;
    572 	case void =>
    573 		yield "";
    574 	};
    575 	let (suff, signed) = if (suff == "u8") (ltok::LIT_U8, false)
    576 		else if (suff == "u16") (ltok::LIT_U16, false)
    577 		else if (suff == "u32") (ltok::LIT_U32, false)
    578 		else if (suff == "u64") (ltok::LIT_U64, false)
    579 		else if (suff == "u") (ltok::LIT_UINT, false)
    580 		else if (suff == "z") (ltok::LIT_SIZE, false)
    581 		else if (suff == "i8") (ltok::LIT_I8, true)
    582 		else if (suff == "i16") (ltok::LIT_I16, true)
    583 		else if (suff == "i32") (ltok::LIT_I32, true)
    584 		else if (suff == "i64") (ltok::LIT_I64, true)
    585 		else if (suff == "i") (ltok::LIT_INT, true)
    586 		else if (suff == "" && !float && exp >= 0) (ltok::LIT_ICONST, false)
    587 		else if (suff == "f32") (ltok::LIT_F32, false)
    588 		else if (suff == "f64") (ltok::LIT_F64, false)
    589 		else if (suff == "" && (float || exp < 0)) (ltok::LIT_FCONST, false)
    590 		else return syntaxerr(loc, "invalid literal suffix");
    591 
    592 	let exp = if (exp < 0) switch (suff) {
    593 		case ltok::LIT_F32, ltok::LIT_F64, ltok::LIT_FCONST =>
    594 			yield exp: size;
    595 		case => return syntaxerr(loc,
    596 				"invalid negative exponent of integer");
    597 	} else exp: size;
    598 
    599 	let val = strings::fromutf8(chars[..end])!;
    600 	let val = switch (suff) {
    601 	case ltok::LIT_F32, ltok::LIT_F64, ltok::LIT_FCONST =>
    602 		val = strings::fromutf8(chars[..floatend])!;
    603 		yield strconv::stof64(val, base);
    604 	case =>
    605 		yield strconv::stou64(val, base);
    606 	};
    607 	let val = match (val) {
    608 	case let val: u64 =>
    609 		for (let i = 0z; i < exp; i += 1) {
    610 			let old = val;
    611 			val *= 10;
    612 			if (val / 10 != old) {
    613 				return syntaxerr(loc, "overflow in exponent");
    614 			};
    615 		};
    616 		if (signed && val > types::I64_MIN: u64) {
    617 			return syntaxerr(loc, "overflow in exponent");
    618 		};
    619 		yield val;
    620 	case let val: f64 =>
    621 		yield val;
    622 	case strconv::invalid =>
    623 		abort(); // Shouldn't be lexed in
    624 	case strconv::overflow =>
    625 		return syntaxerr(loc, "literal overflow");
    626 	};
    627 
    628 	line_comment(lex)?;
    629 	return (suff, val, loc);
    630 };
    631 
    632 fn lex2(lexr: *lexer) (token | error) = {
    633 	let first = next(lexr)? as (rune, location);
    634 	let tok: (ltok, [](rune, ltok)) = switch (first.0) {
    635 	case '*' =>
    636 		yield (ltok::TIMES, [('=', ltok::TIMESEQ)]);
    637 	case '%' =>
    638 		yield (ltok::MODULO, [('=', ltok::MODEQ)]);
    639 	case '/' =>
    640 		match (next(lexr)?) {
    641 		case let r: (rune, location) =>
    642 			switch (r.0) {
    643 			case '=' =>
    644 				line_comment(lexr)?;
    645 				return (ltok::DIVEQ, void, first.1);
    646 			case '/' =>
    647 				lex_comment(lexr)?;
    648 				return lex(lexr);
    649 			case =>
    650 				unget(lexr, r.0);
    651 				return (ltok::DIV, void, first.1);
    652 			};
    653 		case io::EOF =>
    654 			return (ltok::DIV, void, first.1);
    655 		};
    656 	case '+' =>
    657 		yield (ltok::PLUS, [('=', ltok::PLUSEQ)]);
    658 	case '-' =>
    659 		yield (ltok::MINUS, [('=', ltok::MINUSEQ)]);
    660 	case ':' =>
    661 		yield (ltok::COLON, [(':', ltok::DOUBLE_COLON)]);
    662 	case '!' =>
    663 		yield (ltok::LNOT, [('=', ltok::NEQUAL)]);
    664 	case '=' =>
    665 		yield (ltok::EQUAL, [('=', ltok::LEQUAL), ('>', ltok::ARROW)]);
    666 	case =>
    667 		return syntaxerr(first.1, "unknown token sequence");
    668 	};
    669 	match (next(lexr)?) {
    670 	case let r: (rune, location) =>
    671 		for (let i = 0z; i < len(tok.1); i += 1) {
    672 			if (tok.1[i].0 == r.0) {
    673 				line_comment(lexr)?;
    674 				return (tok.1[i].1, void, first.1);
    675 			};
    676 		};
    677 		unget(lexr, r.0);
    678 		line_comment(lexr)?;
    679 	case io::EOF => void;
    680 	};
    681 	return (tok.0, void, first.1);
    682 };
    683 
    684 fn lex3(lex: *lexer) (token | error) = {
    685 	let r = next(lex)? as (rune, location);
    686 	let toks = switch (r.0) {
    687 	case '.' =>
    688 		let tok = if (try(lex, '.')? is void) {
    689 			lex.require_int = true;
    690 			yield ltok::DOT;
    691 		} else if (try(lex, '.')? is void) {
    692 			yield ltok::DOUBLE_DOT;
    693 		} else ltok::ELLIPSIS;
    694 		line_comment(lex)?;
    695 		return (tok, void, r.1);
    696 	case '<' =>
    697 		yield [ltok::LESS, ltok::LESSEQ, ltok::LSHIFT, ltok::LSHIFTEQ];
    698 	case '>' =>
    699 		yield [ltok::GT, ltok::GTEQ, ltok::RSHIFT,
    700 			ltok::RSHIFTEQ];
    701 	case '&' =>
    702 		yield [ltok::BAND, ltok::BANDEQ, ltok::LAND, ltok::LANDEQ];
    703 	case '|' =>
    704 		yield [ltok::BOR, ltok::BOREQ, ltok::LOR, ltok::LOREQ];
    705 	case '^' =>
    706 		yield [ltok::BXOR, ltok::BXOREQ, ltok::LXOR, ltok::LXOREQ];
    707 	case =>
    708 		return syntaxerr(r.1, "unknown token sequence");
    709 	};
    710 	let idx = match (try(lex, r.0, '=')?) {
    711 	case void =>
    712 		yield 0; // X
    713 	case let n: (rune, location) =>
    714 		yield switch (n.0) {
    715 		case '=' =>
    716 			yield 1; // X=
    717 		case =>
    718 			yield match (try(lex, '=')?) {
    719 			case void =>
    720 				yield 2; // XX
    721 			case (rune, location) =>
    722 				yield 3; // XX=
    723 			};
    724 		};
    725 	};
    726 	line_comment(lex)?;
    727 	return (toks[idx], void, r.1);
    728 };
    729 
    730 // Unlex a single token. The next call to [[lex]] will return this token. Only one
    731 // unlex is supported at a time; you must call [[lex]] before calling [[unlex]]
    732 // again.
    733 export fn unlex(lex: *lexer, tok: token) void = {
    734 	assert(lex.un.0 == ltok::EOF, "attempted to unlex more than one token");
    735 	lex.un = tok;
    736 };
    737 
    738 fn next(lex: *lexer) ((rune, location) | syntax | io::EOF | io::error) = {
    739 	match (bufio::scan_rune(lex.in)) {
    740 	case let e: (io::EOF | io::error) =>
    741 		return e;
    742 	case let r: rune =>
    743 		const loc = mkloc(lex);
    744 		lexloc(lex, r);
    745 		return (r, loc);
    746 	case utf8::invalid =>
    747 		return syntaxerr(mkloc(lex), "Source file is not valid UTF-8");
    748 	};
    749 };
    750 
    751 fn nextw(lex: *lexer) ((rune, location) | io::EOF | error) = {
    752 	for (true) match (next(lex)?) {
    753 	case io::EOF =>
    754 		return io::EOF;
    755 	case let r: (rune, location) =>
    756 		if (ascii::isspace(r.0)) {
    757 			if (r.0 == '\n') {
    758 				free(lex.comment);
    759 				lex.comment = "";
    760 			};
    761 			continue;
    762 		};
    763 		if (!is_name(r.0, true) && r.0 != '/') {
    764 			free(lex.comment);
    765 			lex.comment = "";
    766 		};
    767 		return r;
    768 	};
    769 };
    770 
    771 fn try(
    772 	lex: *lexer,
    773 	want: rune...
    774 ) ((rune, location) | syntax | void | io::error) = {
    775 	let r = match (next(lex)?) {
    776 	case io::EOF =>
    777 		return;
    778 	case let r: (rune, location) =>
    779 		yield r;
    780 	};
    781 	assert(len(want) > 0);
    782 	for (let i = 0z; i < len(want); i += 1) {
    783 		if (r.0 == want[i]) {
    784 			return r;
    785 		};
    786 	};
    787 	unget(lex, r.0);
    788 };
    789 
    790 fn unget(lex: *lexer, r: rune) void = {
    791 	bufio::unreadrune(lex.in, r);
    792 
    793 	// here, we set the current location to the previous location, then
    794 	// subtract one from the previous location's column. this is always
    795 	// correct, even for tabs and newlines, since a tab or newline will
    796 	// never be ungot after a previous unget call. besides tabs and
    797 	// newlines, the rune will always be a printable ASCII character
    798 	assert(ascii::isprint(r) || r == '\t' || r == '\n');
    799 	assert(r != '\n' || lex.prevrloc.0 == lex.loc.0 - 1);
    800 
    801 	lex.loc = lex.prevrloc;
    802 	lex.prevrloc.1 -= 1;
    803 };
    804 
    805 fn lexloc(lex: *lexer, r: rune) void = {
    806 	lex.prevrloc = lex.loc;
    807 	switch (r) {
    808 	case '\n' =>
    809 		lex.loc.0 += 1;
    810 		lex.loc.1 = 1;
    811 	case '\t' =>
    812 		lex.loc.1 += 8 - lex.loc.1 % 8 + 1;
    813 	case =>
    814 		lex.loc.1 += 1;
    815 	};
    816 };
    817 
    818 export fn mkloc(lex: *lexer) location = {
    819 	const loc = if (lex.un.0 == ltok::EOF) lex.loc
    820 		else lex.prevunlocs[1].1;
    821 	return location {
    822 		path = lex.path,
    823 		line = loc.0,
    824 		col = loc.1,
    825 	};
    826 };
    827 
    828 export fn prevloc(lex: *lexer) location = {
    829 	const loc = if (lex.un.0 == ltok::EOF) lex.prevrloc
    830 		else lex.prevunlocs[1].0;
    831 	return location {
    832 		path = lex.path,
    833 		line = loc.0,
    834 		col = loc.1,
    835 	};
    836 };
    837 
    838 export fn syntaxerr(loc: location, why: str) error = {
    839 	static let buf = path::buffer{...};
    840 	path::set(&buf, loc.path)!;
    841 	loc.path = path::string(&buf);
    842 	return (loc, why);
    843 };