hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

lex.ha (7264B)


      1 // License: MPL-2.0
      2 // (c) 2022 Drew DeVault <sir@cmpwn.com>
      3 use ascii;
      4 use bufio;
      5 use encoding::utf8;
      6 use io;
      7 use os;
      8 use strconv;
      9 use strings;
     10 use strio;
     11 
     12 export type lexer = struct {
     13 	src: io::handle,
     14 	buffer: []u8,
     15 	strbuf: strio::stream,
     16 	un: (token | void),
     17 	rb: (rune | void),
     18 	loc: (uint, uint),
     19 	prevloc: (uint, uint),
     20 	nextloc: (uint, uint),
     21 	prevrloc: (uint, uint),
     22 };
     23 
     24 // Creates a new JSON lexer. The caller may obtain tokens with [[lex]] and
     25 // should pass the result to [[close]] when they're done with it.
     26 export fn newlexer(src: io::handle) lexer = {
     27 	let buf: []u8 = alloc([0...], os::BUFSIZ);
     28 	return lexer {
     29 		src = src,
     30 		buffer = buf,
     31 		strbuf = strio::dynamic(),
     32 		un = void,
     33 		rb = void,
     34 		loc = (1, 0),
     35 		...
     36 	};
     37 };
     38 
     39 // Frees state associated with a JSON lexer.
     40 export fn close(lex: *lexer) void = {
     41 	free(lex.buffer);
     42 };
     43 
     44 // Returns the next token from a JSON lexer. The return value is borrowed from
     45 // the lexer and will be overwritten on subsequent calls.
     46 export fn lex(lex: *lexer) (token | io::EOF | error) = {
     47 	match (lex.un) {
     48 	case void =>
     49 		lex.prevloc = lex.loc;
     50 	case let tok: token =>
     51 		lex.un = void;
     52 		lex.prevloc = lex.loc;
     53 		lex.loc = lex.nextloc;
     54 		return tok;
     55 	};
     56 
     57 	const rn = match (nextrunews(lex)?) {
     58 	case io::EOF =>
     59 		return io::EOF;
     60 	case let rn: rune =>
     61 		yield rn;
     62 	};
     63 
     64 	switch (rn) {
     65 	case '[' =>
     66 		return arraystart;
     67 	case ']' =>
     68 		return arrayend;
     69 	case '{' =>
     70 		return objstart;
     71 	case '}' =>
     72 		return objend;
     73 	case ',' =>
     74 		return comma;
     75 	case ':' =>
     76 		return colon;
     77 	case '"' =>
     78 		return scan_str(lex)?;
     79 	case =>
     80 		yield;
     81 	};
     82 
     83 	if (ascii::isdigit(rn) || rn == '-') {
     84 		unget(lex, rn);
     85 		return scan_number(lex)?;
     86 	};
     87 
     88 	if (!ascii::isalpha(rn)) {
     89 		return lex.loc: invalid;
     90 	};
     91 
     92 	unget(lex, rn);
     93 	const word = scan_word(lex)?;
     94 	switch (word) {
     95 	case "true" =>
     96 		return true;
     97 	case "false" =>
     98 		return false;
     99 	case "null" =>
    100 		return _null;
    101 	case =>
    102 		return lex.loc: invalid;
    103 	};
    104 };
    105 
    106 // "Unlexes" a token from the lexer, such that the next call to [[lex]] will
    107 // return that token again. Only one token can be unlexed at a time, otherwise
    108 // the program will abort.
    109 export fn unlex(lex: *lexer, tok: token) void = {
    110 	assert(lex.un is void, "encoding::json::unlex called twice in a row");
    111 	lex.un = tok;
    112 	lex.nextloc = lex.loc;
    113 	lex.loc = lex.prevloc;
    114 };
    115 
    116 // Scans until encountering a non-alphabetical character, returning the
    117 // resulting word.
    118 fn scan_word(lex: *lexer) (str | error) = {
    119 	strio::reset(&lex.strbuf);
    120 
    121 	for (true) {
    122 		const rn = match (nextrune(lex)?) {
    123 		case let rn: rune =>
    124 			yield rn;
    125 		case io::EOF =>
    126 			break;
    127 		};
    128 		if (!ascii::isalpha(rn)) {
    129 			unget(lex, rn);
    130 			break;
    131 		};
    132 		strio::appendrune(&lex.strbuf, rn)!;
    133 	};
    134 
    135 	return strio::string(&lex.strbuf);
    136 };
    137 
    138 type numstate = enum {
    139 	SIGN,
    140 	START,
    141 	ZERO,
    142 	INTEGER,
    143 	FRACSTART,
    144 	FRACTION,
    145 	EXPSIGN,
    146 	EXPSTART,
    147 	EXPONENT,
    148 };
    149 
    150 fn scan_number(lex: *lexer) (token | error) = {
    151 	strio::reset(&lex.strbuf);
    152 
    153 	let state = numstate::SIGN;
    154 	for (true) {
    155 		const rn = match (nextrune(lex)?) {
    156 		case let rn: rune =>
    157 			yield rn;
    158 		case io::EOF =>
    159 			break;
    160 		};
    161 
    162 		switch (state) {
    163 		case numstate::SIGN =>
    164 			state = numstate::START;
    165 			if (rn != '-') {
    166 				unget(lex, rn);
    167 				continue;
    168 			};
    169 		case numstate::START =>
    170 			switch (rn) {
    171 			case '0' =>
    172 				state = numstate::ZERO;
    173 			case =>
    174 				if (!ascii::isdigit(rn)) {
    175 					return lex.loc: invalid;
    176 				};
    177 				state = numstate::INTEGER;
    178 			};
    179 		case numstate::ZERO =>
    180 			switch (rn) {
    181 			case '.' =>
    182 				state = numstate::FRACSTART;
    183 			case 'e', 'E' =>
    184 				state = numstate::EXPSIGN;
    185 			case =>
    186 				if (ascii::isdigit(rn)) {
    187 					return lex.loc: invalid;
    188 				};
    189 				unget(lex, rn);
    190 				break;
    191 			};
    192 		case numstate::INTEGER =>
    193 			switch (rn) {
    194 			case '.' =>
    195 				state = numstate::FRACSTART;
    196 			case 'e', 'E' =>
    197 				state = numstate::EXPSIGN;
    198 			case =>
    199 				if (!ascii::isdigit(rn)) {
    200 					unget(lex, rn);
    201 					break;
    202 				};
    203 			};
    204 		case numstate::FRACSTART =>
    205 			if (!ascii::isdigit(rn)) {
    206 				return lex.loc: invalid;
    207 			};
    208 			state = numstate::FRACTION;
    209 		case numstate::FRACTION =>
    210 			switch (rn) {
    211 			case 'e', 'E' =>
    212 				state = numstate::EXPSIGN;
    213 			case =>
    214 				if (!ascii::isdigit(rn)) {
    215 					unget(lex, rn);
    216 					break;
    217 				};
    218 			};
    219 		case numstate::EXPSIGN =>
    220 			state = numstate::EXPSTART;
    221 			if (rn != '+' && rn != '-') {
    222 				unget(lex, rn);
    223 				continue;
    224 			};
    225 		case numstate::EXPSTART =>
    226 			if (!ascii::isdigit(rn)) {
    227 				return lex.loc: invalid;
    228 			};
    229 			state = numstate::EXPONENT;
    230 		case numstate::EXPONENT =>
    231 			if (!ascii::isdigit(rn)) {
    232 				unget(lex, rn);
    233 				break;
    234 			};
    235 		};
    236 
    237 		strio::appendrune(&lex.strbuf, rn)!;
    238 	};
    239 
    240 	match (strconv::stof64(strio::string(&lex.strbuf))) {
    241 	case let f: f64 =>
    242 		return f;
    243 	case =>
    244 		return lex.loc: invalid;
    245 	};
    246 };
    247 
    248 fn scan_str(lex: *lexer) (token | error) = {
    249 	strio::reset(&lex.strbuf);
    250 
    251 	for (true) {
    252 		const rn = match (nextrune(lex)?) {
    253 		case let rn: rune =>
    254 			yield rn;
    255 		case io::EOF =>
    256 			lex.loc.1 += 1;
    257 			return lex.loc: invalid;
    258 		};
    259 
    260 		switch (rn) {
    261 		case '"' =>
    262 			break;
    263 		case '\\' =>
    264 			const rn = scan_escape(lex)?;
    265 			strio::appendrune(&lex.strbuf, rn)!;
    266 		case =>
    267 			if (iscntrl(rn)) {
    268 				return lex.loc: invalid;
    269 			};
    270 			strio::appendrune(&lex.strbuf, rn)!;
    271 		};
    272 	};
    273 
    274 	return strio::string(&lex.strbuf);
    275 };
    276 
    277 fn scan_escape(lex: *lexer) (rune | error) = {
    278 	const rn = match (nextrune(lex)?) {
    279 	case let rn: rune =>
    280 		yield rn;
    281 	case io::EOF =>
    282 		return lex.loc: invalid;
    283 	};
    284 
    285 	switch (rn) {
    286 	case '\"' =>
    287 		return '\"';
    288 	case '\\' =>
    289 		return '\\';
    290 	case '/' =>
    291 		return '/';
    292 	case 'b' =>
    293 		return '\b';
    294 	case 'f' =>
    295 		return '\f';
    296 	case 'n' =>
    297 		return '\n';
    298 	case 'r' =>
    299 		return '\r';
    300 	case 't' =>
    301 		return '\t';
    302 	case 'u' =>
    303 		let buf: [4]u8 = [0...];
    304 		match (io::readall(lex.src, buf)?) {
    305 		case io::EOF =>
    306 			return lex.loc: invalid;
    307 		case size =>
    308 			yield;
    309 		};
    310 		const s = match (strings::try_fromutf8(buf)) {
    311 		case let s: str =>
    312 			yield s;
    313 		case =>
    314 			return lex.loc: invalid;
    315 		};
    316 		match (strconv::stou32b(s, strconv::base::HEX)) {
    317 		case let u: u32 =>
    318 			lex.loc.1 += 4;
    319 			return u: rune;
    320 		case =>
    321 			return lex.loc: invalid;
    322 		};
    323 	case =>
    324 		return lex.loc: invalid;
    325 	};
    326 };
    327 
    328 // Gets the next rune from the lexer.
    329 fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
    330 	if (lex.rb is rune) {
    331 		lex.prevrloc = lex.loc;
    332 		const r = lex.rb as rune;
    333 		lex.rb = void;
    334 		if (r == '\n') {
    335 			lex.loc = (lex.loc.0 + 1, 0);
    336 		} else {
    337 			lex.loc.1 += 1;
    338 		};
    339 		return r;
    340 	};
    341 	match (bufio::scanrune(lex.src)) {
    342 	case let err: io::error =>
    343 		return err;
    344 	case utf8::invalid =>
    345 		return lex.loc: invalid;
    346 	case io::EOF =>
    347 		return io::EOF;
    348 	case let rn: rune =>
    349 		lex.prevrloc = lex.loc;
    350 		if (rn == '\n') {
    351 			lex.loc = (lex.loc.0 + 1, 0);
    352 		} else {
    353 			lex.loc.1 += 1;
    354 		};
    355 		return rn;
    356 	};
    357 };
    358 
    359 // Like nextrune but skips whitespace.
    360 fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
    361 	for (true) {
    362 		match (nextrune(lex)?) {
    363 		case let rn: rune =>
    364 			if (isspace(rn)) {
    365 				continue;
    366 			};
    367 			return rn;
    368 		case io::EOF =>
    369 			return io::EOF;
    370 		};
    371 	};
    372 	abort(); // Unreachable
    373 };
    374 
    375 fn unget(lex: *lexer, r: rune) void = {
    376 	assert(lex.rb is void);
    377 	lex.rb = r;
    378 	lex.loc = lex.prevrloc;
    379 };
    380 
    381 fn iscntrl(r: rune) bool = r: u32 < 0x20;
    382 
    383 fn isspace(r: rune) bool = ascii::isspace(r) && r != '\f';