hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

commit 38f1231600285172e4b71d8f966f217dee120fac
parent 67ef7d2af7a3023cc011e08fa5746807fdada349
Author: Sebastian <sebastian@sebsite.pw>
Date:   Sat, 28 Oct 2023 02:34:30 -0400

hare::parse::doc: rewrite

It's an actual parser now. It also correctly handles everything
documented in hare-doc(5).

Signed-off-by: Sebastian <sebastian@sebsite.pw>

Diffstat:
M cmd/haredoc/doc/html.ha | 107 ++++++++++++++++++++++++++++++++++++++++++++-----------------------------------
M cmd/haredoc/doc/types.ha | 7 +++++--
A hare/parse/doc/+test.ha | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M hare/parse/doc/doc.ha | 459 +++++++++++++++++++++++++++++++++++++++++++++++--------------------------------
4 files changed, 510 insertions(+), 238 deletions(-)

diff --git a/cmd/haredoc/doc/html.ha b/cmd/haredoc/doc/html.ha @@ -5,6 +5,7 @@ use encoding::utf8; use fmt; use hare::ast; +use hare::lex; use hare::parse::doc; use hare::unparse; use io; @@ -84,7 +85,11 @@ export fn emit_html(ctx: *context) (void | error) = { case void => void; case let f: io::file => fmt::fprintln(ctx.out, "<div class='readme'>")?; - markup_html(ctx, f)?; + markup_html(ctx, f, lex::location { + path = "README", // XXX: this is meh + line = 1, + col = 1, + })?; fmt::fprintln(ctx.out, "</div>")?; }; @@ -227,7 +232,7 @@ fn details(ctx: *context, decl: ast::decl) (void | error) = { const trimmed = trim_comment(decl.docs); defer free(trimmed); const buf = strings::toutf8(trimmed); - markup_html(ctx, &memio::fixed(buf))?; + markup_html(ctx, &memio::fixed(buf), decl.start)?; } else { fmt::fprintln(ctx.out, "</details>")?; }; @@ -261,7 +266,7 @@ fn htmlref(ctx: *context, ref: ast::ident) (void | error) = { case symkind::MODULE => let ipath = strings::join("/", id...); defer free(ipath); - fmt::fprintf(ctx.out, "<a href='/{}' class='ref'>{}</a>", + fmt::fprintf(ctx.out, "<a href='/{}' class='ref'>{}::</a>", ipath, ident)?; case symkind::SYMBOL => let ipath = strings::join("/", id[..len(id) - 1]...); @@ -280,65 +285,71 @@ fn htmlref(ctx: *context, ref: ast::ident) (void | error) = { free(ident); }; -fn markup_html(ctx: *context, in: io::handle) (void | error) = { - let parser = doc::parse(in); - let waslist = false; - for (true) { - const tok = match (doc::scan(&parser)) { - case void => - if (waslist) { - fmt::fprintln(ctx.out, "</ul>")?; - }; - break; - case let tok: doc::token => - yield tok; - }; - match (tok) { - case doc::paragraph => - if (waslist) { - fmt::fprintln(ctx.out, "</ul>")?; - waslist = false; - }; - fmt::fprintln(ctx.out)?; - fmt::fprint(ctx.out, "<p>")?; - case let tx: doc::text => - defer free(tx); - match (uri::parse(strings::trim(tx))) { +fn html_paragraph(ctx: *context, p: doc::paragraph) (void | error) = { + for (let i = 0z; i < 
len(p); i += 1) { + match (p[i]) { + case let s: str => + match (uri::parse(s)) { case let uri: uri::uri => defer uri::finish(&uri); - if (uri.host is net::ip::addr || len(uri.host as str) > 0) { + if (uri.host is ip::addr || len(uri.host as str) > 0) { fmt::fprint(ctx.out, "<a rel='nofollow noopener' href='")?; uri::fmt(ctx.out, &uri)?; fmt::fprint(ctx.out, "'>")?; - html_escape(ctx.out, tx)?; + html_escape(ctx.out, s)?; fmt::fprint(ctx.out, "</a>")?; } else { - html_escape(ctx.out, tx)?; + html_escape(ctx.out, s)?; }; case uri::invalid => - html_escape(ctx.out, tx)?; + html_escape(ctx.out, s)?; }; - case let re: doc::reference => - htmlref(ctx, re)?; - case let sa: doc::sample => - if (waslist) { - fmt::fprintln(ctx.out, "</ul>")?; - waslist = false; + case let d: doc::decl_ref => + htmlref(ctx, d)?; + case let m: doc::mod_ref => + htmlref(ctx, m)?; + }; + }; +}; + +fn markup_html( + ctx: *context, + in: io::handle, + loc: lex::location, +) (void | error) = { + const doc = match (doc::parse(in, loc)) { + case let doc: doc::doc => + yield doc; + case let err: lex::syntax => + const err = lex::strerror(err); + fmt::errorfln("Warning:", err)?; + fmt::fprint(ctx.out, "<p class='ref invalid'>Can't parse docs: ")?; + html_escape(ctx.out, err)?; + fmt::fprintln(ctx.out)?; + return; + }; + defer doc::freeall(doc); + + for (let i = 0z; i < len(doc); i += 1) { + match (doc[i]) { + case let p: doc::paragraph => + fmt::fprint(ctx.out, "<p>")?; + html_paragraph(ctx, p)?; + fmt::fprintln(ctx.out)?; + case let l: doc::list => + fmt::fprintln(ctx.out, "<ul>")?; + for (let i = 0z; i < len(l); i += 1) { + fmt::fprint(ctx.out, "<li>")?; + html_paragraph(ctx, l[i])?; + fmt::fprintln(ctx.out)?; }; + fmt::fprintln(ctx.out, "</ul>")?; + case let c: doc::code_sample => fmt::fprint(ctx.out, "<pre class='sample'>")?; - html_escape(ctx.out, sa)?; - fmt::fprint(ctx.out, "</pre>")?; - free(sa); - case doc::listitem => - if (!waslist) { - fmt::fprintln(ctx.out, "<ul>")?; - waslist = true; - 
}; - fmt::fprint(ctx.out, "<li>")?; + html_escape(ctx.out, c)?; + fmt::fprintln(ctx.out, "</pre>")?; }; }; - fmt::fprintln(ctx.out)?; - return; }; fn syn_centry( diff --git a/cmd/haredoc/doc/types.ha b/cmd/haredoc/doc/types.ha @@ -7,13 +7,14 @@ use hare::ast; use hare::lex; use hare::module; use hare::parse; +use hare::parse::doc; use io; use os::exec; export type haredoc_colors_error = !str; -export type error = !(lex::error | parse::error | io::error | module::error | - exec::error | fs::error | haredoc_colors_error); +export type error = !(lex::error | parse::error | doc::error | io::error | + module::error | exec::error | fs::error | haredoc_colors_error); export fn strerror(err: error) str = { match (err) { @@ -21,6 +22,8 @@ export fn strerror(err: error) str = { return lex::strerror(err); case let err: parse::error => return parse::strerror(err); + case let err: doc::error => + return doc::strerror(err); case let err: io::error => return io::strerror(err); case let err: module::error => diff --git a/hare/parse/doc/+test.ha b/hare/parse/doc/+test.ha @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: MPL-2.0 +// (c) Hare authors <https://harelang.org> + +use fmt; +use hare::ast; +use hare::lex; +use hare::unparse; +use memio; +use os; +use strings; + +fn assert_doc_eq(a: doc, b: doc) void = { + assert(len(a) == len(b)); + for (let i = 0z; i < len(a); i += 1) { + match (a[i]) { + case let a: paragraph => + const b = b[i] as paragraph; + assert_paragraph_eq(a, b); + case let a: list => + const b = b[i] as list; + assert(len(a) == len(b)); + for (let i = 0z; i < len(a); i += 1) { + assert_paragraph_eq(a[i], b[i]); + }; + case let a: code_sample => + const b = b[i] as code_sample; + if (a != b) { + fmt::errorfln("=== wanted code sample\n{}", b)!; + fmt::errorfln("=== got code sample\n{}", a)!; + abort(); + }; + }; + }; +}; + +fn assert_paragraph_eq(a: paragraph, b: paragraph) void = { + fmt::errorln(len(a), len(b))!; + assert(len(a) == len(b)); + for (let i = 0z; i < 
len(a); i += 1) { + match (a[i]) { + case let a: str => + const b = b[i] as str; + if (a != b) { + fmt::errorfln("=== wanted text\n{}", b)!; + fmt::errorfln("=== got text\n{}", a)!; + abort(); + }; + case let a: decl_ref => + const b = b[i] as decl_ref; + if (!ast::ident_eq(a, b)) { + fmt::error("=== wanted decl_ref ")!; + unparse::ident(os::stderr, b)!; + fmt::error("\n=== got decl_ref ")!; + unparse::ident(os::stderr, a)!; + fmt::errorln()!; + abort(); + }; + case let a: mod_ref => + const b = b[i] as mod_ref; + if (!ast::ident_eq(a, b)) { + fmt::error("=== wanted mod_ref ")!; + unparse::ident(os::stderr, b)!; + fmt::error("\n=== got mod_ref ")!; + unparse::ident(os::stderr, a)!; + fmt::errorln()!; + abort(); + }; + }; + }; +}; + +@test fn doc() void = { + // if you have some way in your editor to distinguish tabs from spaces + // you're gonna want to use it here + let in = memio::fixed(strings::toutf8( +` Blablabla asdfghjkl + qwerty[[uiop::]] zxcvbnm + + new paragraph + - list starting immediately after paragraph + - another list item + - yet another + but this one + spans multiple lines + -no leading space +still multiple lines + + code sample + line 2 + no leading space + + + continuing the same code sample + indentation is preserved + as well as multiple spaces + this is now a paragraph because of the [[leading::spaces]] + + - list starting [[after]] [[empty::line::]] + + but with only [one item]] + - + code sample starting immediately after list with one empty item` + )); + + const doc = parse(&in, lex::location { ... 
})!; + defer freeall(doc); + + assert_doc_eq(doc, [ + [ + "Blablabla asdfghjkl qwerty", + ["uiop"]: mod_ref, + " zxcvbnm", + ]: paragraph, + + ["new paragraph"]: paragraph, + + [ + ["list starting immediately after paragraph"], + ["another list item"], + ["yet another but this one spans multiple lines"], + ["no leading space still multiple lines"], + ]: list, + + `code sample +line 2 +no leading space + + +continuing the same code sample + indentation is preserved + as well as multiple spaces`: code_sample, + + [ + " this is now a paragraph because of the ", + ["leading", "spaces"]: decl_ref, + ]: paragraph, + + [ + [ + "list starting ", + ["after"]: decl_ref, + " ", + ["empty", "line"]: mod_ref, + ], + ]: list, + + ["but with only [one item]]"]: paragraph, + + [[]]: list, + + "code sample starting immediately after list with one empty item": code_sample, + ]); +}; + +@test fn invalid_ref() void = { + const tests: [_](str, uint, uint) = [ + ("[[abort]]", 1, 3), + ("[[::foo]]", 1, 3), + ("[[]]", 1, 3), + ("[[foo]", 1, 7), + (" \t\n a\n asdf\t [[]]", 3, 12), + ]; + for (let i = 0u; i < len(tests): uint; i += 1) { + let in = memio::fixed(strings::toutf8(tests[i].0)); + fmt::errorln(tests[i].0)!; + const err = parse(&in, lex::location { + path = "<test>", + line = i + 1, + col = i + 1, + }) as lex::syntax; + assert(err.0.path == "<test>"); + assert(err.0.line == i + tests[i].1); + assert(err.0.col == i + tests[i].2); + }; +}; diff --git a/hare/parse/doc/doc.ha b/hare/parse/doc/doc.ha @@ -1,252 +1,335 @@ // SPDX-License-Identifier: MPL-2.0 // (c) Hare authors <https://harelang.org> -use ascii; use bufio; use encoding::utf8; -use fmt; use hare::ast; +use hare::lex; use hare::parse; use io; use memio; use strings; +use types; -export type paragraph = void; -export type text = str; -export type reference = ast::ident; -export type sample = str; -export type listitem = void; -export type token = (paragraph | text | reference | sample | listitem); - -export type docstate = 
enum { - PARAGRAPH, - TEXT, - LIST, -}; +export type doc = [](paragraph | list | code_sample); -export type parser = struct { - src: bufio::stream, - state: docstate, -}; +export type paragraph = [](str | decl_ref | mod_ref); + +export type list = []paragraph; + +export type code_sample = str; + +export type decl_ref = ast::ident; + +export type mod_ref = ast::ident; + +export type error = !lex::error; -export fn parse(in: io::handle) parser = { - static let buf: [4096]u8 = [0...]; - return parser { - src = bufio::init(in, buf[..], []), - state = docstate::PARAGRAPH, +// Converts an error into a human-friendly string. The result may be statically +// allocated. +export fn strerror(err: error) const str = lex::strerror(err); + +export fn parse(in: io::handle, start: lex::location) (doc | error) = { + let sc = bufio::newscanner(in, types::SIZE_MAX); + defer bufio::finish(&sc); + + match (_parse(&sc)) { + case let doc: doc => + return doc; + case let err: lex::syntax => + err.0.path = start.path; + err.0.line += start.line; + err.0.col += start.col; + return err; + case let err: io::error => + return err; + case utf8::invalid => + // XXX: the location for this error is inaccurate + return lex::syntaxerr(start, "Invalid UTF-8"); }; }; -export fn scan(par: *parser) (token | void) = { - const rn = match (bufio::read_rune(&par.src)!) { - case let rn: rune => - yield rn; - case io::EOF => - return; - }; +fn _parse(sc: *bufio::scanner) (doc | ...error | utf8::invalid) = { + let loc = lex::location { ... }; + let doc: doc = []; - bufio::unreadrune(&par.src, rn); - switch (par.state) { - case docstate::TEXT => - switch (rn) { - case '[' => - return scanref(par); - case => - return scantext(par); - }; - case docstate::LIST => - switch (rn) { - case '[' => - return scanref(par); - case '-' => - return scanlist(par); - case => - return scantext(par); + for (true) match (bufio::scan_rune(sc)?) 
{ + case io::EOF => + break; + case let r: rune => + if (r == ' ') { + r = match (bufio::scan_rune(sc)?) { + case io::EOF => + break; + case let r: rune => + loc.col = 1; + yield r; + }; }; - case docstate::PARAGRAPH => - switch (rn) { - case ' ', '\t' => - return scansample(par); + + switch (r) { + case '\t' => + loc.col = 8; + append(doc, scan_code_sample(sc, &loc)?); + case '\n' => + loc.line += 1; + loc.col = 0; case '-' => - return scanlist(par); + loc.col += 1; + append(doc, scan_list(sc, &loc)?); case => - return scantext(par); + bufio::unreadrune(sc, r); + append(doc, scan_paragraph(sc, &loc)?); }; }; + + return doc; }; -fn scantext(par: *parser) (token | void) = { - if (par.state == docstate::PARAGRAPH) { - par.state = docstate::TEXT; - return paragraph; - }; - // TODO: Collapse whitespace - const buf = memio::dynamic(); - for (true) { - const rn = match (bufio::read_rune(&par.src)!) { - case io::EOF => - break; - case let rn: rune => - yield rn; - }; - switch (rn) { - case '[' => - bufio::unreadrune(&par.src, rn); - break; +fn scan_code_sample( + sc: *bufio::scanner, + loc: *lex::location, +) (code_sample | ...error | utf8::invalid) = { + let s = memio::dynamic(); + for (true) match (bufio::scan_rune(sc)?) { + case io::EOF => + break; + case let r: rune => + switch (r) { + case '\t' => + loc.col += 8 - loc.col % 8; + memio::appendrune(&s, r)!; case '\n' => - memio::appendrune(&buf, rn)!; - const rn = match (bufio::read_rune(&par.src)!) { + loc.line += 1; + loc.col = 0; + + let (r, space) = match (bufio::scan_rune(sc)?) { case io::EOF => break; - case let rn: rune => - yield rn; - }; - if (rn == '\n') { - par.state = docstate::PARAGRAPH; - break; + case let r: rune => + if (r != ' ') yield (r, false); + yield match (bufio::scan_rune(sc)?) 
{ + case io::EOF => + break; + case let r: rune => + yield (r, true); + }; }; - bufio::unreadrune(&par.src, rn); - if (rn == '-' && par.state == docstate::LIST) { + + switch (r) { + case '\t' => + loc.col = 8; + memio::appendrune(&s, '\n')!; + case '\n' => + memio::appendrune(&s, '\n')!; + bufio::unreadrune(sc, '\n'); + case => + bufio::unreadrune(sc, r); + if (space) { + bufio::unreadrune(sc, ' '); + }; break; }; case => - memio::appendrune(&buf, rn)!; + loc.col += 1; + memio::appendrune(&s, r)!; }; }; - let result = memio::string(&buf)!; - if (len(result) == 0) { - return; - }; - return result: text; -}; -fn scanref(par: *parser) (token | void) = { - match (bufio::read_rune(&par.src)!) { - case io::EOF => - return; - case let rn: rune => - if (rn != '[') { - abort(); - }; - }; - match (bufio::read_rune(&par.src)!) { - case io::EOF => - return; - case let rn: rune => - if (rn != '[') { - bufio::unreadrune(&par.src, rn); - return strings::dup("["): text; - }; - }; + return memio::string(&s)!; +}; - const buf = memio::dynamic(); - defer io::close(&buf)!; - // TODO: Handle invalid syntax here +fn scan_list( + sc: *bufio::scanner, + loc: *lex::location, +) (list | ...error | utf8::invalid) = { + let li: list = []; for (true) { - match (bufio::read_rune(&par.src)!) { - case let rn: rune => - switch (rn) { - case ']' => - bufio::read_rune(&par.src) as rune; // ] - break; - case => - memio::appendrune(&buf, rn)!; + match (bufio::scan_rune(sc)?) { + case io::EOF => + append(li, []); + break; + case let r: rune => + if (r != ' ') { + bufio::unreadrune(sc, r); }; + }; + + append(li, scan_paragraph(sc, loc)?); + + match (bufio::scan_rune(sc)?) { case io::EOF => break; + case let r: rune => + if (r != '-') { + bufio::unreadrune(sc, r); + break; + }; }; }; - let id = parse::identstr(memio::string(&buf)!) as ast::ident; - return id: reference; + + return li; }; -fn scansample(par: *parser) (token | void) = { - let nws = 0z; - for (true) { - match (bufio::read_rune(&par.src)!) 
{ - case io::EOF => - return; - case let rn: rune => - switch (rn) { - case ' ' => - nws += 1; - case '\t' => - nws += 8; - case => - bufio::unreadrune(&par.src, rn); +// XXX: should be local to scan_paragraph, once that's possible +type state = enum { + NORMAL, + SPACE, + NEWLINE, +}; + +fn scan_paragraph( + sc: *bufio::scanner, + loc: *lex::location, +) (paragraph | ...error | utf8::invalid) = { + let p: paragraph = []; + let s = memio::dynamic(); + defer io::close(&s)!; + let state = state::NORMAL; + + for (true) match (bufio::scan_rune(sc)?) { + case io::EOF => + break; + case let r: rune => + switch (r) { + case '\t' => + if (state == state::NEWLINE && loc.col <= 1) { + bufio::unreadrune(sc, r); break; }; + loc.col += 8 - loc.col % 8; + if (state == state::NORMAL) { + state = state::SPACE; + }; + continue; + case '\n' => + loc.line += 1; + loc.col = 0; + if (state == state::NEWLINE) { + break; + }; + state = state::NEWLINE; + continue; + case ' ' => + loc.col += 1; + if (state == state::NORMAL) { + state = state::SPACE; + }; + continue; + case '-' => + if (state != state::NEWLINE || loc.col > 1) yield; + // XXX: we may want to reconsider if recognizing '-' + // here is too lenient (what if a line begins with a + // negative number?) + bufio::unreadrune(sc, r); + break; + case => void; }; - }; - if (nws <= 1) { - return scantext(par); - }; - let cont = true; - let buf = memio::dynamic(); - for (cont) { - const rn = match (bufio::read_rune(&par.src)!) { + if (state != state::NORMAL) { + memio::appendrune(&s, ' ')!; + }; + state = state::NORMAL; + loc.col += 1; + + if (r != '[') { + memio::appendrune(&s, r)!; + continue; + }; + + r = match (bufio::scan_rune(sc)?) 
{ case io::EOF => + memio::appendrune(&s, '[')!; break; - case let rn: rune => - yield rn; + case let r: rune => + yield r; }; - switch (rn) { - case '\n' => - memio::appendrune(&buf, rn)!; - case => - memio::appendrune(&buf, rn)!; + if (r != '[') { + memio::appendrune(&s, '[')!; + bufio::unreadrune(sc, r); continue; }; - // Consume whitespace - for (let i = 0z; i < nws) { - match (bufio::read_rune(&par.src)!) { - case io::EOF => - break; - case let rn: rune => - switch (rn) { - case ' ' => - i += 1; - case '\t' => - i += 8; - case '\n' => - memio::appendrune(&buf, rn)!; - i = 0; - case => - bufio::unreadrune(&par.src, rn); - cont = false; - break; + loc.col += 1; + const part = memio::string(&s)!; + if (part != "") { + append(p, strings::dup(part)); + memio::reset(&s); + }; + + let lexer = lex::init(sc, loc.path); + const (ident, mod) = match (parse::ident_trailing(&lexer)) { + case let id: (ast::ident, bool) => + yield id; + case let err: lex::syntax => + if (err.0.line == 1) { + err.0.col += loc.col - 1; + }; + err.0.line += loc.line - 1; + return err; + case let err: io::error => + return err; + }; + + // intentionally not using lex::mkloc, so whitespace is + // accounted for + if (lexer.loc.0 == 1) { + loc.col += lexer.loc.1 - 1; + } else { + loc.col = 0; + }; + loc.line += lexer.loc.0 - 1; + + append(p, if (mod) ident: mod_ref else ident: decl_ref); + + if (lexer.un.0 == lex::ltok::RBRACKET) { + match (bufio::scan_rune(sc)?) { + case io::EOF => void; + case let r: rune => + if (r == ']') { + loc.col += 1; + continue; }; }; }; + return lex::syntaxerr(*loc, "Unterminated reference"); }; - let buf = memio::string(&buf)!; - // Trim trailing newlines - buf = strings::rtrim(buf, '\n'); - return buf: sample; + const part = memio::string(&s)!; + if (part != "") { + append(p, strings::dup(part)); + }; + return p; }; -fn scanlist(par: *parser) (token | void) = { - match (bufio::read_rune(&par.src)!) 
{ - case io::EOF => - return void; - case let rn: rune => - if (rn != '-') { - abort(); +// Frees resources associated with a [[doc]]. +export fn freeall(doc: doc) void = { + for (let i = 0z; i < len(doc); i += 1) { + match (doc[i]) { + case let p: paragraph => + free_paragraph(p); + case let l: list => + for (let i = 0z; i < len(l); i += 1) { + free_paragraph(l[i]); + }; + free(l); + case let c: code_sample => + free(c); }; }; - const rn = match (bufio::read_rune(&par.src)!) { - case io::EOF => - return void; - case let rn: rune => - yield rn; - }; - if (rn != ' ') { - bufio::unreadrune(&par.src, rn); - return strings::dup("-"): text; + free(doc); +}; + +fn free_paragraph(p: paragraph) void = { + for (let i = 0z; i < len(p); i += 1) { + match (p[i]) { + case let s: str => + free(s); + case let d: decl_ref => + ast::ident_free(d); + case let m: mod_ref => + ast::ident_free(m); + }; }; - par.state = docstate::LIST; - return listitem; + free(p); };