lex.ha (7264B)
// License: MPL-2.0
// (c) 2022 Drew DeVault <sir@cmpwn.com>
use ascii;
use bufio;
use encoding::utf8;
use io;
use os;
use strconv;
use strings;
use strio;

// State for a JSON lexer. Create one with [[newlexer]] and release it with
// [[close]].
export type lexer = struct {
	// Handle that runes are read from.
	src: io::handle,
	// Allocated by [[newlexer]] and freed by [[close]]; no function in
	// this file reads or writes it otherwise.
	buffer: []u8,
	// Scratch stream holding the text of the word, number or string
	// currently being scanned. It is reset at the start of every scan.
	strbuf: strio::stream,
	// Token pushed back by [[unlex]], if any.
	un: (token | void),
	// Rune pushed back by unget, if any.
	rb: (rune | void),
	// Current (line, column) location. The line is bumped and the column
	// reset to zero on '\n'; any other rune bumps the column.
	loc: (uint, uint),
	// Location saved on entry to [[lex]]; restored by [[unlex]].
	prevloc: (uint, uint),
	// Location saved by [[unlex]]; restored when the unlexed token is
	// handed out again.
	nextloc: (uint, uint),
	// Location before the most recently read rune; restored by unget.
	prevrloc: (uint, uint),
};

// Creates a new JSON lexer. The caller may obtain tokens with [[lex]] and
// should pass the result to [[close]] when they're done with it.
export fn newlexer(src: io::handle) lexer = {
	let buf: []u8 = alloc([0...], os::BUFSIZ);
	return lexer {
		src = src,
		buffer = buf,
		strbuf = strio::dynamic(),
		un = void,
		rb = void,
		loc = (1, 0),
		...
	};
};

// Frees state associated with a JSON lexer. Strings previously returned by
// [[lex]] are borrowed from the lexer and must not be used afterwards.
export fn close(lex: *lexer) void = {
	free(lex.buffer);
	// The string buffer comes from strio::dynamic() and owns a heap
	// allocation; closing the stream is what releases it.
	io::close(&lex.strbuf)!;
};

// Returns the next token from a JSON lexer. The return value is borrowed from
// the lexer and will be overwritten on subsequent calls.
//
// Returns io::EOF when only whitespace remains in the input, and an error
// carrying the current location when the input is not a valid token.
export fn lex(lex: *lexer) (token | io::EOF | error) = {
	match (lex.un) {
	case void =>
		lex.prevloc = lex.loc;
	case let tok: token =>
		// Hand out the token pushed back by unlex and move the
		// location forward to where it was before the unlex.
		lex.un = void;
		lex.prevloc = lex.loc;
		lex.loc = lex.nextloc;
		return tok;
	};

	const rn = match (nextrunews(lex)?) {
	case io::EOF =>
		return io::EOF;
	case let rn: rune =>
		yield rn;
	};

	// Single-rune tokens, and strings which start with a quote.
	switch (rn) {
	case '[' =>
		return arraystart;
	case ']' =>
		return arrayend;
	case '{' =>
		return objstart;
	case '}' =>
		return objend;
	case ',' =>
		return comma;
	case ':' =>
		return colon;
	case '"' =>
		return scan_str(lex)?;
	case =>
		yield;
	};

	// Numbers: push the first rune back so scan_number sees all of it.
	if (ascii::isdigit(rn) || rn == '-') {
		unget(lex, rn);
		return scan_number(lex)?;
	};

	if (!ascii::isalpha(rn)) {
		return lex.loc: invalid;
	};

	// Anything else must be one of the three literal keywords.
	unget(lex, rn);
	const word = scan_word(lex)?;
	switch (word) {
	case "true" =>
		return true;
	case "false" =>
		return false;
	case "null" =>
		return _null;
	case =>
		return lex.loc: invalid;
	};
};

// "Unlexes" a token from the lexer, such that the next call to [[lex]] will
// return that token again. Only one token can be unlexed at a time, otherwise
// the program will abort.
export fn unlex(lex: *lexer, tok: token) void = {
	assert(lex.un is void, "encoding::json::unlex called twice in a row");
	lex.un = tok;
	// Remember where we are so lex can restore it, then rewind to the
	// location recorded at the start of the previous lex call.
	lex.nextloc = lex.loc;
	lex.loc = lex.prevloc;
};

// Scans until encountering a non-alphabetical character, returning the
// resulting word. The terminating rune is pushed back; reaching EOF also ends
// the word. The result is borrowed from the lexer's string buffer.
fn scan_word(lex: *lexer) (str | error) = {
	strio::reset(&lex.strbuf);

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};
		if (!ascii::isalpha(rn)) {
			unget(lex, rn);
			break;
		};
		strio::appendrune(&lex.strbuf, rn)!;
	};

	return strio::string(&lex.strbuf);
};

// States of the number scanner in scan_number.
type numstate = enum {
	// Expecting an optional leading '-'.
	SIGN,
	// Expecting the first integer digit.
	START,
	// Saw a leading '0'; no further integer digits may follow.
	ZERO,
	// Inside the integer part.
	INTEGER,
	// Saw '.'; at least one digit must follow.
	FRACSTART,
	// Inside the fractional part.
	FRACTION,
	// Saw 'e' or 'E'; expecting an optional '+' or '-'.
	EXPSIGN,
	// Expecting the first exponent digit.
	EXPSTART,
	// Inside the exponent.
	EXPONENT,
};

// Scans a number starting at the next rune and returns it as an f64 token.
// The text is validated with the numstate state machine while being collected
// in the lexer's string buffer, and is then converted with strconv::stof64.
// The first rune that cannot continue the number is pushed back.
fn scan_number(lex: *lexer) (token | error) = {
	strio::reset(&lex.strbuf);

	let state = numstate::SIGN;
	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			break;
		};

		switch (state) {
		case numstate::SIGN =>
			state = numstate::START;
			if (rn != '-') {
				// No sign: re-read this rune in the START state.
				unget(lex, rn);
				continue;
			};
		case numstate::START =>
			switch (rn) {
			case '0' =>
				state = numstate::ZERO;
			case =>
				if (!ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				state = numstate::INTEGER;
			};
		case numstate::ZERO =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				// A digit directly after a leading zero is rejected.
				if (ascii::isdigit(rn)) {
					return lex.loc: invalid;
				};
				unget(lex, rn);
				break;
			};
		case numstate::INTEGER =>
			switch (rn) {
			case '.' =>
				state = numstate::FRACSTART;
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::FRACSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::FRACTION;
		case numstate::FRACTION =>
			switch (rn) {
			case 'e', 'E' =>
				state = numstate::EXPSIGN;
			case =>
				if (!ascii::isdigit(rn)) {
					unget(lex, rn);
					break;
				};
			};
		case numstate::EXPSIGN =>
			state = numstate::EXPSTART;
			if (rn != '+' && rn != '-') {
				// No sign: re-read this rune in the EXPSTART state.
				unget(lex, rn);
				continue;
			};
		case numstate::EXPSTART =>
			if (!ascii::isdigit(rn)) {
				return lex.loc: invalid;
			};
			state = numstate::EXPONENT;
		case numstate::EXPONENT =>
			if (!ascii::isdigit(rn)) {
				unget(lex, rn);
				break;
			};
		};

		strio::appendrune(&lex.strbuf, rn)!;
	};

	// Text cut short by EOF (such as a lone "-") reaches this point without
	// having been rejected above; it is accepted only if stof64 accepts it.
	match (strconv::stof64(strio::string(&lex.strbuf))) {
	case let f: f64 =>
		return f;
	case =>
		return lex.loc: invalid;
	};
};

// Scans the remainder of a string literal whose opening quote has already
// been consumed, and returns its decoded contents. The result is borrowed from
// the lexer's string buffer. Control characters (as defined by iscntrl) and
// EOF before the closing quote are errors.
fn scan_str(lex: *lexer) (token | error) = {
	strio::reset(&lex.strbuf);

	for (true) {
		const rn = match (nextrune(lex)?) {
		case let rn: rune =>
			yield rn;
		case io::EOF =>
			// Report the error one column past the last rune read.
			lex.loc.1 += 1;
			return lex.loc: invalid;
		};

		switch (rn) {
		case '"' =>
			break;
		case '\\' =>
			const rn = scan_escape(lex)?;
			strio::appendrune(&lex.strbuf, rn)!;
		case =>
			if (iscntrl(rn)) {
				return lex.loc: invalid;
			};
			strio::appendrune(&lex.strbuf, rn)!;
		};
	};

	return strio::string(&lex.strbuf);
};

// Decodes one escape sequence whose backslash has already been consumed and
// returns the rune it stands for. Unknown escapes, malformed \u sequences and
// EOF are errors.
fn scan_escape(lex: *lexer) (rune | error) = {
	const rn = match (nextrune(lex)?) {
	case let rn: rune =>
		yield rn;
	case io::EOF =>
		return lex.loc: invalid;
	};

	switch (rn) {
	case '\"' =>
		return '\"';
	case '\\' =>
		return '\\';
	case '/' =>
		return '/';
	case 'b' =>
		return '\b';
	case 'f' =>
		return '\f';
	case 'n' =>
		return '\n';
	case 'r' =>
		return '\r';
	case 't' =>
		return '\t';
	case 'u' =>
		// The four hex digits are read straight from the source handle
		// rather than through nextrune, so the column is advanced by
		// hand once they have been parsed.
		// NOTE(review): each \uXXXX is converted to a rune on its own;
		// UTF-16 surrogate pairs are not combined here.
		let buf: [4]u8 = [0...];
		match (io::readall(lex.src, buf)?) {
		case io::EOF =>
			return lex.loc: invalid;
		case size =>
			yield;
		};
		const s = match (strings::try_fromutf8(buf)) {
		case let s: str =>
			yield s;
		case =>
			return lex.loc: invalid;
		};
		match (strconv::stou32b(s, strconv::base::HEX)) {
		case let u: u32 =>
			lex.loc.1 += 4;
			return u: rune;
		case =>
			return lex.loc: invalid;
		};
	case =>
		return lex.loc: invalid;
	};
};

// Gets the next rune from the lexer.
// A rune pushed back by unget is returned first; otherwise one is decoded from
// the source handle. In both cases the location is advanced past the rune.
// Returns io::EOF at end of input and an error for I/O failures or invalid
// UTF-8.
fn nextrune(lex: *lexer) (rune | io::EOF | error) = {
	match (lex.rb) {
	case let r: rune =>
		lex.rb = void;
		advance_loc(lex, r);
		return r;
	case void =>
		yield;
	};

	match (bufio::scanrune(lex.src)) {
	case let err: io::error =>
		return err;
	case utf8::invalid =>
		return lex.loc: invalid;
	case io::EOF =>
		return io::EOF;
	case let rn: rune =>
		advance_loc(lex, rn);
		return rn;
	};
};

// Saves the current location in prevrloc (so that unget can restore it) and
// then moves loc past rn: a newline starts the next line at column zero, any
// other rune advances the column by one.
fn advance_loc(lex: *lexer, rn: rune) void = {
	lex.prevrloc = lex.loc;
	if (rn == '\n') {
		lex.loc = (lex.loc.0 + 1, 0);
	} else {
		lex.loc.1 += 1;
	};
};

// Like nextrune but skips whitespace.
fn nextrunews(lex: *lexer) (rune | io::EOF | error) = {
	for (true) {
		match (nextrune(lex)?) {
		case let rn: rune =>
			if (isspace(rn)) {
				continue;
			};
			return rn;
		case io::EOF =>
			return io::EOF;
		};
	};
	abort(); // Unreachable
};

// Pushes a rune back so that the next call to nextrune returns it again, and
// rewinds the location to where it was before that rune was read. Only one
// rune may be pushed back at a time; a second unget without an intervening
// nextrune aborts.
fn unget(lex: *lexer, r: rune) void = {
	assert(lex.rb is void);
	lex.rb = r;
	lex.loc = lex.prevrloc;
};

// Reports whether r is a control character that may not appear unescaped
// inside a JSON string (any code point below U+0020).
fn iscntrl(r: rune) bool = r: u32 < 0x20;

// Reports whether r is insignificant JSON whitespace. RFC 8259 permits exactly
// space, horizontal tab, line feed and carriage return; form feed and vertical
// tab are not whitespace in JSON.
fn isspace(r: rune) bool = r == ' ' || r == '\t' || r == '\n' || r == '\r';