encoding::base64: new module - hare - The Hare programming language

commit 5b7f3ea99c224c8b050a79cfbe5a11c8f32c62b7
parent 548c618e08115fbffb5dece720d36a01d9b95a17
Author: Steven Guikal <void@fluix.one>
Date:   Mon, 19 Jul 2021 17:27:30 -0400

encoding::base64: new module

The acceptance of invalid padding is not implemented because of the
security considerations described in the RFC[1] and the fact that a
`no_padding` return type would make verifying that no padding was used
more difficult[2]. Should this be desired, a dedicated alphabet type or
set of flags allowing explicitly no padding, any padding, or (perhaps)
any invalid characters at all would be more clear

[1]https://datatracker.ietf.org/doc/html/rfc4648#section-12
[2]One would have to check that either `no_padding` was returned or, if it
wasn't, that the length of the data is a multiple of 4 and thus didn't
have padding in the first place. The latter part is important otherwise
an attacker could covertly send padding.

Signed-off-by: Steven Guikal <void@fluix.one>

Diffstat:
A encoding/base64/README  | 5 +++++
A encoding/base64/base64.ha  | 348 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M scripts/gen-stdlib  | 7 +++++++
M stdlib.mk  | 28 ++++++++++++++++++++++++++++

4 files changed, 388 insertions(+), 0 deletions(-)
diff --git a/encoding/base64/README b/encoding/base64/README
@@ -0,0 +1,5 @@
+Implementation of the base 64 encoding as per RFC 4648. This implementation
+does not support invalid padding due to security concerns described in the
+RFC.
+
+https://datatracker.ietf.org/doc/html/rfc4648#section-12
diff --git a/encoding/base64/base64.ha b/encoding/base64/base64.ha
@@ -0,0 +1,348 @@
+use bufio;
+use bytes;
+use io;
+use strio;
+use strings;
+
+// RFC 4648 standard "base64" base 64 encoding alphabet.
+export const standard: []u8 = [
+	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+	'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+	'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+	'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+	'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
+	'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+	'w', 'x', 'y', 'z', '0', '1', '2', '3',
+	'4', '5', '6', '7', '8', '9', '+', '/'
+];
+
+// RFC 4648 URL and filename safe "base64url" base 64 encoding alphabet.
+export const urlsafe: []u8 = [
+	'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
+	'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
+	'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
+	'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
+	'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
+	'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
+	'w', 'x', 'y', 'z', '0', '1', '2', '3',
+	'4', '5', '6', '7', '8', '9', '-', '_'
+];
+
+// The padding character used at the end of encoding.
+export def PADDING: u8 = '=': u32: u8;
+
+// Indicates that invalid input was found while decoding, either in the form of
+// characters outside of the base 64 alphabet, insufficient padding, or trailing
+// characters. Contains the index of the first invalid character.
+export type invalid = !size;
+
+// Encodes a byte slice using a base 64 encoding alphabet, with padding, and
+// writes it to an [[io::stream]]. The number of bytes written is returned.
+export fn encode(
+	alphabet: []u8,
+	sink: *io::stream,
+	b: []u8
+) (size | io::error) = {
+	let z = 0z;
+	let i = 0z;
+	for (i + 2 < len(b); i += 3) {
+		z += io::write(sink, [
+			alphabet[b[i] >> 2],
+			alphabet[(b[i] & 0x3) << 4 | b[i + 1] >> 4],
+			alphabet[(b[i + 1] & 0xf) << 2 | b[i + 2] >> 6],
+			alphabet[b[i + 2] & 0x3F],
+		])?;
+	};
+	if (len(b) - i > 0) {
+		if (len(b) - i == 2) {
+			z += io::write(sink, [
+				alphabet[b[i] >> 2],
+				alphabet[(b[i] & 0x3) << 4 | b[i + 1] >> 4],
+				alphabet[(b[i + 1] & 0xf) << 2],
+				PADDING,
+			])?;
+		} else {
+			z += io::write(sink, [
+				alphabet[b[i] >> 2],
+				alphabet[(b[i] & 0x3) << 4],
+				PADDING,
+				PADDING,
+			])?;
+		};
+	};
+	return z;
+};
+
+// Calls [[encode]] with the [[standard]] base 64 encoding alphabet.
+export fn stdencode(
+	sink: *io::stream,
+	b: []u8,
+) (size | io::error) = encode(standard, sink, b);
+
+// Calls [[encode]] with the [[urlsafe]] base 64 encoding alphabet.
+export fn urlencode(
+	sink: *io::stream,
+	b: []u8,
+) (size | io::error) = encode(urlsafe, sink, b);
+
+// Encodes a byte slice using a base 64 encoding alphabet, with padding, and
+// returns it. The caller must free the return value.
+export fn encodestr(alphabet: []u8, b: []u8) str = {
+	let sink = strio::dynamic();
+	encode(alphabet, sink, b) as size;
+	return strio::finish(sink);
+};
+
+// Calls [[encodestr]] with the [[standard]] base 64 encoding alphabet.
+export fn stdencodestr(b: []u8) str = encodestr(standard, b);
+
+// Calls [[encodestr]] with the [[urlsafe]] base 64 encoding alphabet.
+export fn urlencodestr(b: []u8) str = encodestr(urlsafe, b);
+
+@test fn encode() void = {
+	const in: [_]u8 = ['f', 'o', 'o', 'b', 'a', 'r'];
+	const expect: [_]str = [
+		"",
+		"Zg==",
+		"Zm8=",
+		"Zm9v",
+		"Zm9vYg==",
+		"Zm9vYmE=",
+		"Zm9vYmFy"
+	];
+	for (let i = 0z; i < len(in); i += 1) {
+		let s = encodestr(standard, in[..i]);
+		defer free(s);
+		assert(s == expect[i]);
+	};
+};
+
+// Decodes base 64-encoded data in the given base 64 alphabet, with padding,
+// from an [[io::stream]]. The number of bytes written is returned.
+export fn decode(
+	alphabet: []u8,
+	in: *io::stream,
+	out: *io::stream,
+) (size | invalid | io::error) = {
+	const INVALID_OR_PAD = 255u8;
+	let decoder: [256]u8 = [INVALID_OR_PAD...];
+	for (let i = 0z; i < len(alphabet); i += 1) {
+		decoder[alphabet[i]] = i: u8;
+	};
+
+	let count = 0z;
+	let z = 0z;
+	for (true) {
+		let buf: [4]u8 = [0...];
+		match (io::read(in, buf)) {
+			size => {
+				for (let i = 0z; i < 2; i += 1) {
+					if (decoder[buf[i]] == INVALID_OR_PAD) {
+						return (count + i): invalid;
+					} else {
+						buf[i] = decoder[buf[i]];
+					};
+				};
+
+				if (decoder[buf[2]] == INVALID_OR_PAD) {
+					if (buf[2] != PADDING) {
+						return (count + 2z): invalid;
+					};
+					if (buf[3] != PADDING) {
+						return (count + 3z): invalid;
+					};
+					z += io::write(out, [
+						buf[0] << 2 | buf[1] >> 4,
+					])?;
+					let extra: []u8 = [0];
+					return match (io::read(in, extra)) {
+						size => (count + 4z): invalid,
+						io::EOF => z,
+					};
+				} else {
+					buf[2] = decoder[buf[2]];
+				};
+
+				if (decoder[buf[3]] == INVALID_OR_PAD) {
+					if (buf[3] != PADDING) {
+						return (count + 3z): invalid;
+					};
+					z += io::write(out, [
+						buf[0] << 2 | buf[1] >> 4,
+						buf[1] << 4 | buf[2] >> 2,
+					])?;
+					let extra: []u8 = [0];
+					return match (io::read(in, extra)) {
+						size => (count + 4z): invalid,
+						io::EOF => z,
+					};
+				} else {
+					buf[3] = decoder[buf[3]];
+				};
+
+				z += io::write(out, [
+					buf[0] << 2 | buf[1] >> 4,
+					buf[1] << 4 | buf[2] >> 2,
+					buf[2] << 6 | buf[3],
+				])?;
+				count += 4;
+			},
+			io::EOF => {
+				break;
+			},
+		};
+	};
+	return z;
+};
+
+// Calls [[decode]] with the [[standard]] base 64 encoding alphabet.
+export fn stddecode(
+	in: *io::stream,
+	out: *io::stream,
+) (size | invalid | io::error) = decode(standard, in, out);
+
+// Calls [[decode]] with the [[urlsafe]] base 64 encoding alphabet.
+export fn urldecode(
+	in: *io::stream,
+	out: *io::stream,
+) (size | invalid | io::error) = decode(urlsafe, in, out);
+
+// Decodes base 64-encoded data in the given base 64 alphabet, with padding,
+// from an [[io::stream]]. Every four input bytes are guaranteed to either be
+// decoded into the user-provided buffer or return [[invalid]]. The number of
+// bytes written is returned.
+export fn decode_static(
+	alphabet: []u8,
+	out: []u8,
+	in: *io::stream,
+) (size | invalid) = {
+	let buf = bufio::fixed(out, io::mode::WRITE);
+	defer io::close(buf);
+	return match (decode(alphabet, in, buf)) {
+		io::error => abort(),
+		z: invalid => z: invalid,
+		z: size => z,
+	};
+};
+
+// Calls [[decode_static]] with the [[standard]] base 64 encoding alphabet.
+export fn stddecode_static(out: []u8, in: *io::stream) (size | invalid) = {
+	return decode_static(standard, out, in);
+};
+
+// Calls [[decode_static]] with the [[urlsafe]] base 64 encoding alphabet.
+export fn urldecode_static(out: []u8, in: *io::stream) (size | invalid) = {
+	return decode_static(urlsafe, out, in);
+};
+
+// Decodes a string of base 64-encoded data in the given base 64 encoding
+// alphabet, with padding, into a byte slice. The caller must free the return
+// value.
+export fn decodestr(alphabet: []u8, in: str) ([]u8 | invalid) = {
+	return decodeslice(alphabet, strings::toutf8(in));
+};
+
+// Calls [[decodestr]] with the [[standard]] base 64 encoding alphabet.
+export fn stddecodestr(in: str) ([]u8 | invalid) = decodestr(standard, in);
+
+// Calls [[decodestr]] with the [[urlsafe]] base 64 encoding alphabet.
+export fn urldecodestr(in: str) ([]u8 | invalid) = decodestr(urlsafe, in);
+
+// Decodes a string of base 64-encoded data in the given base 64 encoding
+// alphabet, with padding. Every four input bytes are guaranteed to either be
+// decoded into the user-provided buffer or return [[invalid]]. The number of
+// bytes written is returned.
+export fn decodestr_static(
+	alphabet: []u8,
+	out: []u8,
+	in: str,
+) (size | invalid) = {
+	return decodeslice_static(alphabet, out, strings::toutf8(in));
+};
+
+// Calls [[decodestr_static]] with the [[standard]] base 64 encoding alphabet.
+export fn stddecodestr_static(out: []u8, in: str) (size | invalid) = {
+	return decodestr_static(standard, out, in);
+};
+
+// Calls [[decodestr_static]] with the [[urlsafe]] base 64 encoding alphabet.
+export fn urldecodestr_static(out: []u8, in: str) (size | invalid) = {
+	return decodestr_static(urlsafe, out, in);
+};
+
+// Decodes a byte slice of base 64-encoded data in the given base 64 encoding
+// alphabet, with padding, into a byte slice. The caller must free the return
+// value.
+export fn decodeslice(alphabet: []u8, in: []u8) ([]u8 | invalid) = {
+	let out = bufio::dynamic(io::mode::WRITE);
+	let in = bufio::fixed(in, io::mode::READ);
+	defer io::close(in);
+	return match (decode(alphabet, in, out)) {
+		io::error => abort(),
+		z: invalid => z: invalid,
+		size => bufio::finish(out),
+	};
+};
+
+// Calls [[decodeslice]] with the [[standard]] base 64 encoding alphabet.
+export fn stddecodeslice(in: []u8) ([]u8 | invalid) = decodeslice(standard, in);
+
+// Calls [[decodeslice]] with the [[urlsafe]] base 64 encoding alphabet.
+export fn urldecodeslice(in: []u8) ([]u8 | invalid) = decodeslice(urlsafe, in);
+
+// Decodes a byte slice of base 64-encoded data in the given base 64 encoding
+// alphabet, with padding. Every four input bytes are guaranteed to either be
+// decoded into the user-provided buffer or return [[invalid]]. The number of
+// bytes written is returned.
+export fn decodeslice_static(
+	alphabet: []u8,
+	out: []u8,
+	in: []u8,
+) (size | invalid) = {
+	let in = bufio::fixed(in, io::mode::READ);
+	defer io::close(in); // bufio::finish?
+	return decode_static(alphabet, out, in);
+};
+
+// Calls [[decodeslice_static]] with the [[standard]] base 64 encoding alphabet.
+export fn stddecodeslice_static(out: []u8, in: []u8) (size | invalid) = {
+	return decodeslice_static(standard, out, in);
+};
+
+// Calls [[decodeslice_static]] with the [[urlsafe]] base 64 encoding alphabet.
+export fn urldecodeslice_static(out: []u8, in: []u8) (size | invalid) = {
+	return decodeslice_static(urlsafe, out, in);
+};
+
+@test fn decode() void = {
+	const in: [_]str = [
+		"",
+		"Zg==",
+		"Zm8=",
+		"Zm9v",
+		"Zm9vYg==",
+		"Zm9vYmE=",
+		"Zm9vYmFy",
+	];
+	const expect: [_]u8 = ['f', 'o', 'o', 'b', 'a', 'r'];
+	for (let i = 0z; i < len(in); i += 1) {
+		let s = stddecodestr(in[i]) as []u8;
+		defer free(s);
+		assert(bytes::equal(s, expect[..i]));
+	};
+	
+	const bad: [_]str = [
+		"A",
+		"AA",
+		"AAA",
+		"!!!!",
+		"====",
+		"A=A=",
+		"AA=A",
+		"Zg==Zg==",
+	];
+	const badindex: [_]size = [1, 2, 3, 0, 0, 1, 3, 4];
+	for (let i = 0z; i < len(bad); i += 1) {
+		assert(stddecodestr(bad[i]) as invalid == badindex[i]: invalid);
+	};
+};
diff --git a/scripts/gen-stdlib b/scripts/gen-stdlib
@@ -244,6 +244,12 @@ dirs() {
 	gen_ssa dirs fs os path
 }
 
+encoding_base64() {
+	gen_srcs encoding::base64 \
+		base64.ha
+	gen_ssa encoding::base64 bufio bytes io strio strings
+}
+
 encoding_hex() {
 	gen_srcs encoding::hex \
 		hex.ha
@@ -827,6 +833,7 @@ crypto::sha1
 crypto::sha256
 crypto::sha512
 dirs
+encoding::base64
 encoding::hex
 encoding::utf8
 endian
diff --git a/stdlib.mk b/stdlib.mk
@@ -129,6 +129,10 @@ hare_stdlib_deps+=$(stdlib_crypto_sha512)
 stdlib_dirs=$(HARECACHE)/dirs/dirs.o
 hare_stdlib_deps+=$(stdlib_dirs)
 
+# gen_lib encoding::base64
+stdlib_encoding_base64=$(HARECACHE)/encoding/base64/encoding_base64.o
+hare_stdlib_deps+=$(stdlib_encoding_base64)
+
 # gen_lib encoding::hex
 stdlib_encoding_hex=$(HARECACHE)/encoding/hex/encoding_hex.o
 hare_stdlib_deps+=$(stdlib_encoding_hex)
@@ -489,6 +493,16 @@ $(HARECACHE)/dirs/dirs.ssa: $(stdlib_dirs_srcs) $(stdlib_rt) $(stdlib_fs) $(stdl
 	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Ndirs \
 		-t$(HARECACHE)/dirs/dirs.td $(stdlib_dirs_srcs)
 
+# encoding::base64
+stdlib_encoding_base64_srcs= \
+	$(STDLIB)/encoding/base64/base64.ha
+
+$(HARECACHE)/encoding/base64/encoding_base64.ssa: $(stdlib_encoding_base64_srcs) $(stdlib_rt) $(stdlib_bufio) $(stdlib_bytes) $(stdlib_io) $(stdlib_strio) $(stdlib_strings)
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(HARECACHE)/encoding/base64
+	@HARECACHE=$(HARECACHE) $(HAREC) $(HAREFLAGS) -o $@ -Nencoding::base64 \
+		-t$(HARECACHE)/encoding/base64/encoding_base64.td $(stdlib_encoding_base64_srcs)
+
 # encoding::hex
 stdlib_encoding_hex_srcs= \
 	$(STDLIB)/encoding/hex/hex.ha
@@ -1297,6 +1311,10 @@ hare_testlib_deps+=$(testlib_crypto_sha512)
 testlib_dirs=$(TESTCACHE)/dirs/dirs.o
 hare_testlib_deps+=$(testlib_dirs)
 
+# gen_lib encoding::base64
+testlib_encoding_base64=$(TESTCACHE)/encoding/base64/encoding_base64.o
+hare_testlib_deps+=$(testlib_encoding_base64)
+
 # gen_lib encoding::hex
 testlib_encoding_hex=$(TESTCACHE)/encoding/hex/encoding_hex.o
 hare_testlib_deps+=$(testlib_encoding_hex)
@@ -1664,6 +1682,16 @@ $(TESTCACHE)/dirs/dirs.ssa: $(testlib_dirs_srcs) $(testlib_rt) $(testlib_fs) $(t
 	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Ndirs \
 		-t$(TESTCACHE)/dirs/dirs.td $(testlib_dirs_srcs)
 
+# encoding::base64
+testlib_encoding_base64_srcs= \
+	$(STDLIB)/encoding/base64/base64.ha
+
+$(TESTCACHE)/encoding/base64/encoding_base64.ssa: $(testlib_encoding_base64_srcs) $(testlib_rt) $(testlib_bufio) $(testlib_bytes) $(testlib_io) $(testlib_strio) $(testlib_strings)
+	@printf 'HAREC \t$@\n'
+	@mkdir -p $(TESTCACHE)/encoding/base64
+	@HARECACHE=$(TESTCACHE) $(HAREC) $(TESTHAREFLAGS) -o $@ -Nencoding::base64 \
+		-t$(TESTCACHE)/encoding/base64/encoding_base64.td $(testlib_encoding_base64_srcs)
+
 # encoding::hex
 testlib_encoding_hex_srcs= \
 	$(STDLIB)/encoding/hex/hex.ha

	hare The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

A	encoding/base64/README	\|	5	+++++
A	encoding/base64/base64.ha	\|	348	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	scripts/gen-stdlib	\|	7	+++++++
M	stdlib.mk	\|	28	++++++++++++++++++++++++++++