commit d396fa03d53033441b5bfb2fd97c81835e09f57c
parent d0c9cd5f8380963cf9e5775a41caf9aea0393a7c
Author: Drew DeVault <sir@cmpwn.com>
Date: Tue, 6 Apr 2021 11:03:50 -0400
encoding::xml: parse element start
Diffstat:
5 files changed, 137 insertions(+), 14 deletions(-)
diff --git a/encoding/xml/chars.ha b/encoding/xml/chars.ha
@@ -0,0 +1,30 @@
+use ascii;
+
+// Reports whether rn may appear as the first character of an XML name (the
+// NameStartChar production of the XML specification).
+fn isnamestart(rn: rune) bool = {
+	// ASCII members of the set: letters, underscore and colon.
+	if (ascii::isalpha(rn) || rn == '_' || rn == ':') {
+		return true;
+	};
+	// Everything else is expressed as inclusive code point ranges.
+	const cp = rn: u32;
+	return cpbetween(cp, 0xC0, 0xD6)
+		|| cpbetween(cp, 0xD8, 0xF6)
+		|| cpbetween(cp, 0xF8, 0x2FF)
+		|| cpbetween(cp, 0x370, 0x37D)
+		|| cpbetween(cp, 0x37F, 0x1FFF)
+		|| cpbetween(cp, 0x200C, 0x200D)
+		|| cpbetween(cp, 0x2070, 0x218F)
+		|| cpbetween(cp, 0x2C00, 0x2FEF)
+		|| cpbetween(cp, 0x3001, 0xD7FF)
+		|| cpbetween(cp, 0xF900, 0xFDCF)
+		|| cpbetween(cp, 0xFDF0, 0xFFFD)
+		|| cpbetween(cp, 0x10000, 0xEFFFF);
+};
+
+// Reports whether cp lies within the inclusive range [lo, hi].
+fn cpbetween(cp: u32, lo: u32, hi: u32) bool = cp >= lo && cp <= hi;
+
+// Reports whether rn may appear in an XML name after its first character (the
+// NameChar production of the XML specification).
+fn isname(rn: rune) bool = {
+	// Any character which may start a name may also continue one.
+	if (isnamestart(rn)) {
+		return true;
+	};
+	// ASCII characters permitted only after the first position.
+	if (rn == '-' || rn == '.' || ascii::isdigit(rn)) {
+		return true;
+	};
+	// Remaining non-ASCII additions, tested by code point.
+	const cp = rn: u32;
+	if (cp == 0xB7) {
+		return true;
+	};
+	if (cp >= 0x300 && cp <= 0x36F) {
+		return true;
+	};
+	return cp >= 0x203F && cp <= 0x2040;
+};
diff --git a/encoding/xml/parser.ha b/encoding/xml/parser.ha
@@ -3,6 +3,7 @@ use bufio;
use encoding::utf8;
use io;
use strings;
+use strio;
// Returns an XML parser which reads from a stream. The caller must call
// [parser_free] when they are finished with it.
@@ -36,10 +37,65 @@ export fn parser_free(par: *parser) void = {
free(par);
};
+// Scans for and returns the next [token], or void if the end of the input is
+// reached before another token begins. The caller must pass the returned token
+// to [token_free] when they're done with it.
+export fn scan(par: *parser) (token | void | error) = {
+	// NOTE(review): OPTWS presumably consumes optional leading whitespace;
+	// confirm against want.
+	want(par, OPTWS)?;
+	// Peek at the next rune to decide what to parse, then push it back so
+	// that the dedicated scanner reads it again.
+	let rn: rune = match (bufio::scanrune(par.in)?) {
+		io::EOF => return void,
+		rn: rune => rn,
+	};
+	bufio::unreadrune(par.in, rn);
+	return switch (par.state) {
+		state::ELEMENT => switch (rn) {
+			'<' => {
+				// Propagate a failure before touching the state, so
+				// the parser only moves on to ATTRS once an element
+				// start has actually been scanned.
+				let el = scan_element(par)?;
+				par.state = state::ATTRS;
+				el;
+			},
+			* => syntaxerr,
+		},
+		state::ATTRS => {
+			abort(); // TODO
+		},
+	};
+};
+
+// Consumes the '<' which opens an element and the name which follows it,
+// returning that name as an [elementstart] token.
+fn scan_element(par: *parser) (token | error) = {
+	want(par, '<')?;
+	const ident = scan_name(par)?;
+	return ident: elementstart;
+};
+
+// Scans a name from the parser's input and returns it as a newly allocated
+// string owned by the caller. The first rune must satisfy [isnamestart] and
+// each following rune [isname]; the first rune which is not part of the name
+// is pushed back onto the input. Returns [syntaxerr] if the name does not begin
+// with a name-start character or if the input ends before the name does.
+fn scan_name(par: *parser) (str | error) = {
+	// Validate the first rune before allocating, so the early error paths
+	// have nothing to clean up.
+	const first = match (bufio::scanrune(par.in)?) {
+		io::EOF => return syntaxerr,
+		rn: rune => rn,
+	};
+	if (!isnamestart(first)) {
+		return syntaxerr;
+	};
+
+	let buf = strio::dynamic();
+	// Release the buffer on every error return below (including those
+	// taken via ?). On success ownership moves to the caller through
+	// strio::finish, so the guard is disarmed first.
+	let done = false;
+	defer if (!done) {
+		io::close(buf);
+	};
+	strio::appendrune(buf, first);
+
+	for (true) match (bufio::scanrune(par.in)?) {
+		io::EOF => return syntaxerr,
+		rn: rune => if (isname(rn)) {
+			strio::appendrune(buf, rn);
+		} else {
+			// Not part of the name: leave it for the next scanner.
+			bufio::unreadrune(par.in, rn);
+			break;
+		},
+	};
+
+	done = true;
+	return strio::finish(buf);
+};
+
fn prolog(par: *parser) (void | error) = {
- want(par, "<?xml", WS);
+ want(par, "<?xml", WS)?;
- want(par, "version", OPTWS, '=', OPTWS);
+ want(par, "version", OPTWS, '=', OPTWS)?;
let quot = quote(par)?;
want(par, OPTWS, "1.")?;
for (true) match (bufio::scanrune(par.in)?) {
diff --git a/encoding/xml/types.ha b/encoding/xml/types.ha
@@ -1,19 +1,40 @@
use encoding::utf8;
use io;
-// A syntax error was encountered in the document.
-export type syntaxerr = void!; // TODO: Add line number?
-
-// Any error which can occur during XML parsing.
-export type error = (syntaxerr | utf8::invalid | io::error)!;
-
// Represents the state for an XML parser.
export type parser = struct {
orig: *io::stream,
in: *io::stream,
buf: [4096]u8,
+ // Selects what [scan] attempts to parse next.
+ state: state,
};
+// The states a [parser] can be in. [scan] switches on the current state to
+// decide what to parse next.
+export type state = enum {
+ // [scan] expects the '<' which begins an element; any other rune is a
+ // syntax error.
+ ELEMENT,
+ // Entered by [scan] after it handles the '<' of an element start.
+ // Scanning in this state is not implemented yet.
+ ATTRS,
+};
+
+// The start of an XML element, e.g. <example. The string holds the element
+// name and is freed by [token_free].
+export type elementstart = str;
+
+// The end of an XML element, e.g. /> or </example>. Carries no data, so
+// [token_free] has nothing to release for it.
+export type elementend = void;
+
+// An attribute of an XML element, e.g. foo="bar". Both strings are freed by
+// [token_free].
+// NOTE(review): presumably (name, value) in that order; confirm once attribute
+// scanning is implemented.
+export type attribute = (str, str);
+
+// Text content of an XML element, e.g. baz or <![CDATA[baz]]>. The string is
+// freed by [token_free].
+export type text = str;
+
+// Any valid XML token. Pass to [token_free] when finished with it.
+export type token = (elementstart | elementend | attribute | text);
+
+// A syntax error was encountered in the document.
+export type syntaxerr = void!; // TODO: Add line number?
+
+// Any error which can occur during XML parsing: a [syntaxerr], invalid UTF-8
+// reported by encoding::utf8, or an error from the io module.
+export type error = (syntaxerr | utf8::invalid | io::error)!;
+
// Converts an [error] to a user-friendly string representation.
export fn strerror(err: error) const str = {
return match (err) {
@@ -22,3 +43,16 @@ export fn strerror(err: error) const str = {
err: io::error => io::strerror(err),
};
};
+
+// Releases the strings owned by a [token]. An [elementend] owns nothing; every
+// other kind of token has each of its strings freed.
+export fn token_free(tok: token) void = {
+	match (tok) {
+		elementend => void,
+		name: elementstart => free(name),
+		content: text => free(content),
+		pair: attribute => {
+			free(pair.0);
+			free(pair.1);
+		},
+	};
+};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -216,8 +216,9 @@ encoding_utf8() {
encoding_xml() {
gen_srcs encoding::xml \
types.ha \
- parser.ha
- gen_ssa encoding::xml io bufio strings
+ parser.ha \
+ chars.ha
+ gen_ssa encoding::xml io bufio strings ascii strio
}
endian() {
diff --git a/stdlib.mk b/stdlib.mk
@@ -351,9 +351,10 @@ $(HARECACHE)/encoding/utf8/encoding_utf8.ssa: $(stdlib_encoding_utf8_srcs) $(std
# encoding::xml
stdlib_encoding_xml_srcs= \
$(STDLIB)/encoding/xml/types.ha \
- $(STDLIB)/encoding/xml/parser.ha
+ $(STDLIB)/encoding/xml/parser.ha \
+ $(STDLIB)/encoding/xml/chars.ha
-$(HARECACHE)/encoding/xml/encoding_xml.ssa: $(stdlib_encoding_xml_srcs) $(stdlib_rt) $(stdlib_io) $(stdlib_bufio) $(stdlib_strings)
+$(HARECACHE)/encoding/xml/encoding_xml.ssa: $(stdlib_encoding_xml_srcs) $(stdlib_rt) $(stdlib_io) $(stdlib_bufio) $(stdlib_strings) $(stdlib_ascii) $(stdlib_strio)
@printf 'HAREC \t$@\n'
@mkdir -p $(HARECACHE)/encoding/xml
@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::xml \
@@ -1167,9 +1168,10 @@ $(TESTCACHE)/encoding/utf8/encoding_utf8.ssa: $(testlib_encoding_utf8_srcs) $(te
# encoding::xml
testlib_encoding_xml_srcs= \
$(STDLIB)/encoding/xml/types.ha \
- $(STDLIB)/encoding/xml/parser.ha
+ $(STDLIB)/encoding/xml/parser.ha \
+ $(STDLIB)/encoding/xml/chars.ha
-$(TESTCACHE)/encoding/xml/encoding_xml.ssa: $(testlib_encoding_xml_srcs) $(testlib_rt) $(testlib_io) $(testlib_bufio) $(testlib_strings)
+$(TESTCACHE)/encoding/xml/encoding_xml.ssa: $(testlib_encoding_xml_srcs) $(testlib_rt) $(testlib_io) $(testlib_bufio) $(testlib_strings) $(testlib_ascii) $(testlib_strio)
@printf 'HAREC \t$@\n'
@mkdir -p $(TESTCACHE)/encoding/xml
@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::xml \