commit c6fbfa5519948400676271027e801400732d7bdd
parent 235bdab44ca526567e112b23cd1aaff6dce042e4
Author: Drew DeVault <sir@cmpwn.com>
Date: Mon, 5 Apr 2021 15:30:50 -0400
encoding::xml: initial skeleton
Diffstat:
4 files changed, 204 insertions(+), 0 deletions(-)
diff --git a/encoding/xml/parser.ha b/encoding/xml/parser.ha
@@ -0,0 +1,144 @@
+use ascii;
+use bufio;
+use encoding::utf8;
+use io;
+use strings;
+
+// Returns an XML parser which reads from a stream. The caller must call
+// [parser_free] when they are finished with it.
+//
+// Hare's XML parser only supports UTF-8 encoded input files.
+//
+// This function will attempt to read the XML prologue before returning, and
+// will return an error if it is not valid. When an error is returned the
+// parser has already been released, so the caller has nothing to free.
+export fn parse(in: *io::stream) (*parser | error) = {
+	// XXX: The main reason we allocate this instead of returning it on the
+	// stack is so that we have a consistent address for the bufio buffer.
+	// This is kind of lame, maybe we can avoid that.
+	let par = alloc(parser {
+		orig = in,
+		in = in,
+		...
+	});
+	// Wrap the caller's stream in a buffered stream backed by par.buf,
+	// unless it is already buffered.
+	if (!bufio::isbuffered(in)) {
+		par.in = bufio::buffered(par.in, par.buf[..], []);
+	};
+	match (prolog(par)) {
+		void => void,
+		err: error => {
+			// The caller never receives a pointer it could hand to
+			// parser_free, so release the parser (and the buffered
+			// wrapper, if one was created) here to avoid a leak.
+			parser_free(par);
+			return err;
+		},
+	};
+	return par;
+};
+
+// Releases everything the parser owns. The stream which was originally passed
+// to [parse] is not closed; only the buffered wrapper created by the parser
+// (if any) is.
+export fn parser_free(par: *parser) void = {
+	// Runs last, once the wrapper stream below has been dealt with.
+	defer free(par);
+	if (par.in != par.orig) {
+		io::close(par.in);
+	};
+};
+
+// Parses the XML declaration at the head of the document:
+//
+//	<?xml version="1.N" [encoding="UTF-8"] [standalone="yes"] ?>
+//
+// Returns [syntaxerr] if the input does not match this form, [utf8::invalid] if
+// an encoding other than "UTF-8" is declared, or the underlying error if
+// reading from the stream fails.
+fn prolog(par: *parser) (void | error) = {
+	want(par, "<?xml", WS)?;
+
+	want(par, "version", OPTWS, '=', OPTWS)?;
+	let quot = quote(par)?;
+	// NOTE(review): OPTWS accepts whitespace between the opening quote and
+	// the version number, which the XML grammar does not; confirm whether
+	// this leniency is intended.
+	want(par, OPTWS, "1.")?;
+	// The version number is "1." followed by one or more digits; count them
+	// so that a bare "1." is rejected.
+	let ndigit = 0;
+	for (true; ndigit += 1) match (bufio::scanrune(par.in)?) {
+		io::EOF => break,
+		rn: rune => if (!ascii::isdigit(rn)) {
+			bufio::unreadrune(par.in, rn);
+			break;
+		},
+	};
+	if (ndigit < 1) {
+		return syntaxerr;
+	};
+	want(par, quot)?;
+
+	// TODO: Replace this with attribute() when it's written
+	// Peek at the next rune: an encoding declaration is only recognized
+	// when it is separated from the version by whitespace.
+	let hadws = want(par, OPTWS)?;
+	let encoding = match (bufio::scanrune(par.in)?) {
+		io::EOF => false,
+		rn: rune => {
+			bufio::unreadrune(par.in, rn);
+			hadws && rn == 'e';
+		},
+	};
+	if (encoding) {
+		want(par, "encoding", OPTWS, '=', OPTWS)?;
+		let quot = quote(par)?;
+		// Any declared encoding other than UTF-8 is reported as
+		// utf8::invalid rather than as a plain syntax error.
+		match (want(par, "UTF-8")) {
+			syntaxerr => return utf8::invalid,
+			err: error => return err,
+			bool => void,
+		};
+		want(par, quot)?;
+	};
+
+	// Same peeking scheme for the optional standalone declaration.
+	let hadws = want(par, OPTWS)?;
+	let standalone = match (bufio::scanrune(par.in)?) {
+		io::EOF => false,
+		rn: rune => {
+			bufio::unreadrune(par.in, rn);
+			hadws && rn == 's';
+		},
+	};
+	if (standalone) {
+		want(par, "standalone", OPTWS, '=', OPTWS)?;
+		let quot = quote(par)?;
+		// TODO: Should we support standalone="no"?
+		want(par, "yes", quot)?;
+	};
+
+	want(par, OPTWS, "?>")?;
+	// TODO: Parse doctypedecl & misc
+	return;
+};
+
+// Token type accepted by [want] which stands for a run of whitespace in the
+// input. Mandatory if true: [want] fails with [syntaxerr] when the value is
+// true and no whitespace is present.
+type whitespace = bool;
+// Requires at least one whitespace character.
+def WS: whitespace = true;
+// Allows, but does not require, whitespace.
+def OPTWS: whitespace = false;
+
+// Reads a single rune and requires it to be a quotation mark, either '"' or
+// '\''. The mark which was read is returned so that the caller can later demand
+// the matching closing quote. End of input, or any other rune, is a
+// [syntaxerr]; read errors are propagated.
+fn quote(par: *parser) (rune | error) = {
+	let mark = match (bufio::scanrune(par.in)?) {
+		rn: rune => rn,
+		* => return syntaxerr,
+	};
+	if (mark != '"' && mark != '\'') {
+		return syntaxerr;
+	};
+	return mark;
+};
+
+// Consumes the given tokens from the parser's input, in order:
+//
+// - a rune must match the next rune of input exactly;
+// - a str must match rune-for-rune;
+// - a [whitespace] token consumes any run of XML whitespace, and fails if the
+//   token is mandatory ([WS]) and the run is empty.
+//
+// Returns [syntaxerr] on the first mismatch (input consumed up to that point is
+// not restored), or the underlying error if reading fails. On success, returns
+// whether the last whitespace token in the list consumed at least one
+// character, or false if the list had no whitespace token.
+fn want(par: *parser, tok: (rune | str | whitespace)...) (bool | error) = {
+	let hadws = false;
+	for (let i = 0z; i < len(tok); i += 1) match (tok[i]) {
+		x: rune => {
+			// End of input where a rune was expected is a syntax
+			// error.
+			let have = match (bufio::scanrune(par.in)?) {
+				* => return syntaxerr,
+				rn: rune => rn,
+			};
+			if (have != x) {
+				return syntaxerr;
+			};
+		},
+		x: str => {
+			// Match a string by matching each of its runes in turn.
+			let iter = strings::iter(x);
+			for (true) match (strings::next(&iter)) {
+				rn: rune => want(par, rn)?,
+				void => break,
+			};
+		},
+		ws: whitespace => {
+			// n counts the whitespace runes consumed; the first
+			// non-whitespace rune is pushed back for the next token.
+			let n = 0;
+			for (true; n += 1) match (bufio::scanrune(par.in)?) {
+				io::EOF => break,
+				// XML whitespace is exactly space, tab, carriage
+				// return and line feed. ascii::isspace would also
+				// accept vertical tab and form feed, which are not
+				// permitted in an XML document.
+				rn: rune => if (rn != ' ' && rn != '\t'
+						&& rn != '\r' && rn != '\n') {
+					bufio::unreadrune(par.in, rn);
+					break;
+				},
+			};
+			if (ws && n < 1) {
+				return syntaxerr;
+			};
+			hadws = n >= 1;
+		},
+	};
+	return hadws;
+};
diff --git a/encoding/xml/types.ha b/encoding/xml/types.ha
@@ -0,0 +1,24 @@
+use encoding::utf8;
+use io;
+
+// A syntax error was encountered in the document. This error carries no
+// further detail about where in the input the problem occurred.
+export type syntaxerr = void!; // TODO: Add line number?
+
+// Any error which can occur during XML parsing: a malformed document
+// ([syntaxerr]), input which is not valid UTF-8, or a failure reported by the
+// underlying I/O stream. Use [strerror] to obtain a printable description.
+export type error = (syntaxerr | utf8::invalid | io::error)!;
+
+// Represents the state for an XML parser.
+export type parser = struct {
+ // The stream which was handed to [parse]. It is kept so that
+ // [parser_free] can tell whether "in" is a wrapper it must close.
+ orig: *io::stream,
+ // The stream the parser actually reads from: either "orig" itself, or a
+ // buffered stream wrapping it when "orig" was not already buffered.
+ in: *io::stream,
+ // Backing storage passed to the buffered stream as its read buffer.
+ buf: [4096]u8,
+};
+
+// Returns a human-readable description of an [error]. I/O errors are described
+// by [io::strerror]; the other cases map to fixed messages.
+export fn strerror(err: error) const str = {
+	let text: const str = match (err) {
+		ioerr: io::error => io::strerror(ioerr),
+		utf8::invalid => "Document is not valid UTF-8",
+		syntaxerr => "Syntax error",
+	};
+	return text;
+};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -213,6 +213,13 @@ encoding_utf8() {
gen_ssa encoding::utf8 types
}
+encoding_xml() {
+ gen_srcs encoding::xml \
+ types.ha \
+ parser.ha
+ gen_ssa encoding::xml io bufio strings
+}
+
endian() {
gen_srcs endian \
big.ha \
@@ -605,6 +612,7 @@ crypto_sha512
dirs
encoding_hex
encoding_utf8
+encoding_xml
endian
errors
fmt
diff --git a/stdlib.mk b/stdlib.mk
@@ -105,6 +105,9 @@ hare_stdlib_deps+=$(stdlib_encoding_hex)
stdlib_encoding_utf8=$(HARECACHE)/encoding/utf8/encoding_utf8.o
hare_stdlib_deps+=$(stdlib_encoding_utf8)
+# encoding::xml: path of the module's object file, added to the list of
+# stdlib dependencies.
+stdlib_encoding_xml=$(HARECACHE)/encoding/xml/encoding_xml.o
+hare_stdlib_deps+=$(stdlib_encoding_xml)
+
stdlib_endian=$(HARECACHE)/endian/endian.o
hare_stdlib_deps+=$(stdlib_endian)
@@ -345,6 +348,17 @@ $(HARECACHE)/encoding/utf8/encoding_utf8.ssa: $(stdlib_encoding_utf8_srcs) $(std
@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::utf8 \
-t$(HARECACHE)/encoding/utf8/encoding_utf8.td $(stdlib_encoding_utf8_srcs)
+# encoding::xml
+stdlib_encoding_xml_srcs= \
+	$(STDLIB)/encoding/xml/types.ha \
+	$(STDLIB)/encoding/xml/parser.ha
+
+# The prerequisites cover every module the sources import; ascii and
+# encoding::utf8 are needed alongside io, bufio and strings so that they are
+# built before this module.
+$(HARECACHE)/encoding/xml/encoding_xml.ssa: $(stdlib_encoding_xml_srcs) $(stdlib_rt) $(stdlib_io) $(stdlib_bufio) $(stdlib_strings) $(stdlib_ascii) $(stdlib_encoding_utf8)
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(HARECACHE)/encoding/xml
+	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::xml \
+		-t$(HARECACHE)/encoding/xml/encoding_xml.td $(stdlib_encoding_xml_srcs)
+
# endian
stdlib_endian_srcs= \
$(STDLIB)/endian/big.ha \
@@ -900,6 +914,9 @@ hare_testlib_deps+=$(testlib_encoding_hex)
testlib_encoding_utf8=$(TESTCACHE)/encoding/utf8/encoding_utf8.o
hare_testlib_deps+=$(testlib_encoding_utf8)
+# encoding::xml: path of the module's test-build object file, added to the
+# list of testlib dependencies.
+testlib_encoding_xml=$(TESTCACHE)/encoding/xml/encoding_xml.o
+hare_testlib_deps+=$(testlib_encoding_xml)
+
testlib_endian=$(TESTCACHE)/endian/endian.o
hare_testlib_deps+=$(testlib_endian)
@@ -1144,6 +1161,17 @@ $(TESTCACHE)/encoding/utf8/encoding_utf8.ssa: $(testlib_encoding_utf8_srcs) $(te
@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::utf8 \
-t$(TESTCACHE)/encoding/utf8/encoding_utf8.td $(testlib_encoding_utf8_srcs)
+# encoding::xml
+testlib_encoding_xml_srcs= \
+	$(STDLIB)/encoding/xml/types.ha \
+	$(STDLIB)/encoding/xml/parser.ha
+
+# The prerequisites cover every module the sources import; ascii and
+# encoding::utf8 are needed alongside io, bufio and strings so that they are
+# built before this module.
+$(TESTCACHE)/encoding/xml/encoding_xml.ssa: $(testlib_encoding_xml_srcs) $(testlib_rt) $(testlib_io) $(testlib_bufio) $(testlib_strings) $(testlib_ascii) $(testlib_encoding_utf8)
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(TESTCACHE)/encoding/xml
+	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::xml \
+		-t$(TESTCACHE)/encoding/xml/encoding_xml.td $(testlib_encoding_xml_srcs)
+
# endian
testlib_endian_srcs= \
$(STDLIB)/endian/big.ha \