crypto::ec: port the i31 p256 implementation from BearSSL - hare

commit 444720781794b0c23d350bbf9ebc4099d0f4f188
parent dd80eb36907f0c8f5d01145cf240f49fdd56f5e9
Author: Armin Preiml <apreiml@strohwolke.at>
Date:   Thu,  7 Mar 2024 14:50:05 +0100

crypto::ec: port the i31 p256 implementation from BearSSL

Signed-off-by: Armin Preiml <apreiml@strohwolke.at>

Diffstat:
A crypto/ec/curves+test.ha  | 453 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crypto/ec/genkeys+test.ha  | 34 ++++++++++++++++++++++++++++++++++
A crypto/ec/p256.ha  | 1252 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

3 files changed, 1739 insertions(+), 0 deletions(-)
diff --git a/crypto/ec/curves+test.ha b/crypto/ec/curves+test.ha
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: MPL-2.0
+// (c) Hare authors <https://harelang.org>
+
+use bytes;
+
+
+// Testcases have been generated with help of pycryptdome.
+
+type multc = struct {
+	a: []u8,
+	b: []u8,
+	x: []u8,
+	y: []u8,
+	result: u32,
+	expected: []u8,
+};
+
+fn tmuladd(c: *curve, tcs: []multc) void = {
+	for (let i = 0z; i < len(tcs); i += 1) {
+		let t = tcs[i];
+		assert(c.muladd(t.a, t.b, t.x, t.y) == t.result);
+		if (t.result == 1) {
+			assert(bytes::equal(t.a, t.expected));
+		};
+	};
+};
+
+@test fn p256_mulgen() void = {
+	const c = p256;
+	// multiply by 1
+	let m1: [P256_SCALARSZ]u8 = [0...];
+	m1[len(m1) - 1] = 1;
+
+	let g = alloc(P256_G);
+	defer free(g);
+	assert(c.mul(g, m1) == 1);
+	assert(bytes::equal(g, P256_G));
+
+	assert(c.mulgen(g, m1) == 65);
+	assert(bytes::equal(g, P256_G));
+
+	// multiply by order - 1
+	let o = alloc(P256_N);
+	defer free(o);
+	o[len(o) - 1] -= 1;
+
+	const expected: [_]u8 = [
+		0x04, 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, 0xf8,
+		0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2, 0x77, 0x03, 0x7d,
+		0x81, 0x2d, 0xeb, 0x33, 0xa0, 0xf4, 0xa1, 0x39, 0x45, 0xd8,
+		0x98, 0xc2, 0x96, 0xb0, 0x1c, 0xbd, 0x1c, 0x01, 0xe5, 0x80,
+		0x65, 0x71, 0x18, 0x14, 0xb5, 0x83, 0xf0, 0x61, 0xe9, 0xd4,
+		0x31, 0xcc, 0xa9, 0x94, 0xce, 0xa1, 0x31, 0x34, 0x49, 0xbf,
+		0x97, 0xc8, 0x40, 0xae, 0x0a,
+	];
+
+	assert(c.mul(g, o) == 1);
+	assert(bytes::equal(g, expected));
+
+	let g = alloc(P256_G);
+	defer free(g);
+	assert(c.mulgen(g, o) == 65);
+	assert(bytes::equal(g, expected));
+	let priv: [_]u8 = [
+		0xde, 0x5c, 0x88, 0x05, 0x42, 0xa0, 0x71, 0xe2, 0xf6, 0xfe,
+		0xd0, 0xdc, 0x80, 0x07, 0x37, 0xc4, 0x35, 0xa6, 0x29, 0x48,
+		0x85, 0x70, 0x4f, 0x54, 0x1c, 0x41, 0x89, 0xaf, 0xf6, 0xbc,
+		0xb5, 0x19,
+	];
+	const expected: [_]u8 = [
+		0x04, 0x8a, 0x0d, 0x13, 0x84, 0x8e, 0x4f, 0xdb, 0x05, 0x83,
+		0x8e, 0x76, 0x24, 0xf9, 0x8a, 0xb2, 0x83, 0x5a, 0x79, 0xb0,
+		0x65, 0x63, 0x9b, 0x4c, 0x30, 0xcf, 0x69, 0x31, 0x46, 0xf7,
+		0x10, 0x49, 0x13, 0x46, 0x0b, 0xb0, 0x4e, 0x3e, 0x44, 0x7f,
+		0xad, 0x96, 0xfa, 0xfb, 0xc5, 0x78, 0xc0, 0xee, 0xc6, 0xc4,
+		0x88, 0xa3, 0xe5, 0xc3, 0x10, 0x3c, 0x63, 0x24, 0xc1, 0x48,
+		0x43, 0x37, 0x13, 0xdf, 0xc9,
+	];
+
+	let g = alloc(P256_G);
+	assert(c.mul(g, priv) == 1);
+	assert(bytes::equal(g, expected));
+
+	let r: [65]u8 = [0...];
+	assert(c.mulgen(r, priv) == 65);
+	assert(bytes::equal(r, expected));
+};
+
+@test fn p256_muladd() void = {
+	let c = p256;
+
+	let tcs: [_]multc = [
+		multc {
+			a = [
+				0x04, 0xee, 0x19, 0x71, 0x69, 0xae, 0xbd, 0x94,
+				0xc7, 0x8d, 0x4e, 0x7f, 0x05, 0x06, 0xb4, 0xdf,
+				0x60, 0x43, 0x22, 0x2c, 0x50, 0x8d, 0x33, 0x8a,
+				0x5a, 0xd0, 0x02, 0x60, 0x95, 0x5e, 0xda, 0x31,
+				0x64, 0x4c, 0x7e, 0x63, 0xa9, 0xa0, 0xaf, 0x56,
+				0xbb, 0x7c, 0x03, 0x84, 0x8a, 0x44, 0x30, 0xfd,
+				0x39, 0x1d, 0xc0, 0x0f, 0x5c, 0x50, 0xcf, 0xda,
+				0xf1, 0x99, 0x69, 0x8e, 0x77, 0x62, 0x56, 0x34,
+				0xce,
+			],
+			b = [],
+			x = [
+				0xef, 0xcb, 0x7a, 0x72, 0x9c, 0xb4, 0x6a, 0xe6,
+				0xb4, 0x66, 0xa6, 0x8c, 0x1a, 0x18, 0x64, 0x5c,
+				0xd8, 0xdc, 0x9d, 0xa2, 0x5b, 0x48, 0x45, 0xcd,
+				0x68, 0x0c, 0x5f, 0x93, 0x1b, 0x74, 0x42, 0x0b,
+			],
+			y = [
+				0x7b, 0x6b, 0x8f, 0x14, 0x8b, 0x7a, 0x5d, 0xbe,
+				0xa5, 0xd0, 0x6b, 0x7a, 0x3c, 0xaa, 0x6f, 0x4b,
+				0x2a, 0xeb, 0x52, 0x7e, 0x80, 0xdb, 0xc5, 0x95,
+				0x73, 0x91, 0xab, 0x3d, 0x29, 0xbc, 0xbd, 0x74,
+			],
+			result = 1,
+			expected = [
+				0x04, 0x85, 0x93, 0xaf, 0x32, 0x68, 0x1c, 0xa8,
+				0xba, 0xf8, 0x64, 0x4e, 0xd3, 0xd8, 0xf3, 0x52,
+				0x88, 0xdf, 0x3b, 0x92, 0x5b, 0xb4, 0xbb, 0xf6,
+				0xd4, 0x2c, 0xce, 0x42, 0x8c, 0xa5, 0x9b, 0xd4,
+				0xb0, 0x87, 0xac, 0xfe, 0x83, 0xcc, 0x11, 0xd1,
+				0x98, 0x3e, 0x63, 0xc8, 0xba, 0xc1, 0x8e, 0x7e,
+				0xf2, 0xd1, 0xa7, 0x78, 0x93, 0xa8, 0x71, 0x9e,
+				0xee, 0xd2, 0xf9, 0xe3, 0xf3, 0x0a, 0xda, 0xf6,
+				0x26,
+			],
+		},
+		multc {
+			a = [
+				0x04, 0xee, 0x19, 0x71, 0x69, 0xae, 0xbd, 0x94,
+				0xc7, 0x8d, 0x4e, 0x7f, 0x05, 0x06, 0xb4, 0xdf,
+				0x60, 0x43, 0x22, 0x2c, 0x50, 0x8d, 0x33, 0x8a,
+				0x5a, 0xd0, 0x02, 0x60, 0x95, 0x5e, 0xda, 0x31,
+				0x64, 0x4c, 0x7e, 0x63, 0xa9, 0xa0, 0xaf, 0x56,
+				0xbb, 0x7c, 0x03, 0x84, 0x8a, 0x44, 0x30, 0xfd,
+				0x39, 0x1d, 0xc0, 0x0f, 0x5c, 0x50, 0xcf, 0xda,
+				0xf1, 0x99, 0x69, 0x8e, 0x77, 0x62, 0x56, 0x34,
+				0xce,
+			],
+			b = [
+				0x04, 0xbd, 0x52, 0x07, 0x83, 0x40, 0x88, 0x6a,
+				0xa5, 0x24, 0xd8, 0x22, 0x13, 0x4f, 0xc3, 0xf3,
+				0x03, 0xca, 0xe1, 0xd3, 0x5e, 0x01, 0x95, 0x82,
+				0x5f, 0xa9, 0x95, 0x9f, 0xc3, 0xc4, 0x92, 0x8f,
+				0xd2, 0x30, 0x18, 0x56, 0x29, 0x93, 0x74, 0x50,
+				0xbd, 0xa6, 0x8d, 0x88, 0xf6, 0x03, 0xd6, 0x16,
+				0xd9, 0x9d, 0x01, 0x82, 0xbe, 0x08, 0x13, 0xec,
+				0x9f, 0xb4, 0xb1, 0x18, 0xbc, 0x14, 0x09, 0x31,
+				0xad,
+			],
+			x = [
+				0xef, 0xcb, 0x7a, 0x72, 0x9c, 0xb4, 0x6a, 0xe6,
+				0xb4, 0x66, 0xa6, 0x8c, 0x1a, 0x18, 0x64, 0x5c,
+				0xd8, 0xdc, 0x9d, 0xa2, 0x5b, 0x48, 0x45, 0xcd,
+				0x68, 0x0c, 0x5f, 0x93, 0x1b, 0x74, 0x42, 0x0b,
+			],
+			y = [
+				0x7b, 0x6b, 0x8f, 0x14, 0x8b, 0x7a, 0x5d, 0xbe,
+				0xa5, 0xd0, 0x6b, 0x7a, 0x3c, 0xaa, 0x6f, 0x4b,
+				0x2a, 0xeb, 0x52, 0x7e, 0x80, 0xdb, 0xc5, 0x95,
+				0x73, 0x91, 0xab, 0x3d, 0x29, 0xbc, 0xbd, 0x74,
+			],
+			result = 1,
+			expected = [
+				0x04, 0x7f, 0x66, 0x4b, 0x8c, 0x3e, 0x64, 0x5b,
+				0xc1, 0x97, 0x60, 0xac, 0xa6, 0xc3, 0x0c, 0x17,
+				0x80, 0xff, 0xd0, 0x95, 0xcd, 0x4d, 0x5b, 0xf1,
+				0x35, 0x25, 0x53, 0x5a, 0xec, 0xff, 0xb1, 0xd9,
+				0xdb, 0xdd, 0x96, 0xe3, 0xfb, 0x84, 0x68, 0x2b,
+				0x53, 0x04, 0x4b, 0xcd, 0x7c, 0x4e, 0x10, 0xfa,
+				0x87, 0x4c, 0x14, 0x43, 0xf0, 0x1b, 0x57, 0x39,
+				0x02, 0x35, 0x44, 0xef, 0xa0, 0x38, 0xb5, 0x70,
+				0xb8,
+			],
+		},
+		// invalid a
+		multc {
+			a = [
+				0x04, 0xee, 0x19, 0x71, 0x69, 0xae, 0xbd, 0x94,
+				0xc7, 0x8d, 0x4e, 0x0f, 0x05, 0x06, 0xb4, 0xdf,
+				0x60, 0x43, 0x22, 0x0c, 0x50, 0x8d, 0x33, 0x8a,
+				0x5a, 0xd0, 0x02, 0x00, 0x95, 0x5e, 0xda, 0x31,
+				0x64, 0x4c, 0x7e, 0x03, 0xa9, 0xa0, 0xaf, 0x56,
+				0xbb, 0x7c, 0x03, 0x04, 0x8a, 0x44, 0x30, 0xfd,
+				0x39, 0x1d, 0xc0, 0x0f, 0x5c, 0x50, 0xcf, 0xda,
+				0xf1, 0x99, 0x69, 0x0e, 0x77, 0x62, 0x56, 0x34,
+				0xce,
+			],
+			b = [
+				0x04, 0xbd, 0x52, 0x07, 0x83, 0x40, 0x88, 0x6a,
+				0xa5, 0x24, 0xd8, 0x22, 0x13, 0x4f, 0xc3, 0xf3,
+				0x03, 0xca, 0xe1, 0xd3, 0x5e, 0x01, 0x95, 0x82,
+				0x5f, 0xa9, 0x95, 0x9f, 0xc3, 0xc4, 0x92, 0x8f,
+				0xd2, 0x30, 0x18, 0x56, 0x29, 0x93, 0x74, 0x50,
+				0xbd, 0xa6, 0x8d, 0x88, 0xf6, 0x03, 0xd6, 0x16,
+				0xd9, 0x9d, 0x01, 0x82, 0xbe, 0x08, 0x13, 0xec,
+				0x9f, 0xb4, 0xb1, 0x18, 0xbc, 0x14, 0x09, 0x31,
+				0xad,
+			],
+			x = [
+				0xef, 0xcb, 0x7a, 0x72, 0x9c, 0xb4, 0x6a, 0xe6,
+				0xb4, 0x66, 0xa6, 0x8c, 0x1a, 0x18, 0x64, 0x5c,
+				0xd8, 0xdc, 0x9d, 0xa2, 0x5b, 0x48, 0x45, 0xcd,
+				0x68, 0x0c, 0x5f, 0x93, 0x1b, 0x74, 0x42, 0x0b,
+			],
+			y = [
+				0x7b, 0x6b, 0x8f, 0x14, 0x8b, 0x7a, 0x5d, 0xbe,
+				0xa5, 0xd0, 0x6b, 0x7a, 0x3c, 0xaa, 0x6f, 0x4b,
+				0x2a, 0xeb, 0x52, 0x7e, 0x80, 0xdb, 0xc5, 0x95,
+				0x73, 0x91, 0xab, 0x3d, 0x29, 0xbc, 0xbd, 0x74,
+			],
+			result = 0,
+			expected = [],
+		},
+		// invalid b
+		multc {
+			a = [
+				0x04, 0xee, 0x19, 0x71, 0x69, 0xae, 0xbd, 0x94,
+				0xc7, 0x8d, 0x4e, 0x7f, 0x05, 0x06, 0xb4, 0xdf,
+				0x60, 0x43, 0x22, 0x2c, 0x50, 0x8d, 0x33, 0x8a,
+				0x5a, 0xd0, 0x02, 0x60, 0x95, 0x5e, 0xda, 0x31,
+				0x64, 0x4c, 0x7e, 0x63, 0xa9, 0xa0, 0xaf, 0x56,
+				0xbb, 0x7c, 0x03, 0x84, 0x8a, 0x44, 0x30, 0xfd,
+				0x39, 0x1d, 0xc0, 0x0f, 0x5c, 0x50, 0xcf, 0xda,
+				0xf1, 0x99, 0x69, 0x8e, 0x77, 0x62, 0x56, 0x34,
+				0xce,
+			],
+			b = [
+				0x04, 0xbd, 0x52, 0x07, 0x83, 0x40, 0x88, 0x6a,
+				0xf5, 0x24, 0xd8, 0x22, 0x13, 0x4f, 0xc3, 0xf3,
+				0xf3, 0xca, 0xe1, 0xd3, 0x5e, 0x01, 0x95, 0x82,
+				0x3f, 0xa9, 0x95, 0x9f, 0xc3, 0xc4, 0x92, 0x8f,
+				0x52, 0x30, 0x18, 0x56, 0x29, 0x93, 0x74, 0x50,
+				0x1d, 0xa6, 0x8d, 0x88, 0xf6, 0x03, 0xd6, 0x16,
+				0xd9, 0x9d, 0x01, 0x82, 0xbe, 0x08, 0x13, 0xec,
+				0x9f, 0xb4, 0xb1, 0x18, 0xbc, 0x14, 0x09, 0x31,
+				0xad,
+			],
+			x = [
+				0xef, 0xcb, 0x7a, 0x72, 0x9c, 0xb4, 0x6a, 0xe6,
+				0xb4, 0x66, 0xa6, 0x8c, 0x1a, 0x18, 0x64, 0x5c,
+				0xd8, 0xdc, 0x9d, 0xa2, 0x5b, 0x48, 0x45, 0xcd,
+				0x68, 0x0c, 0x5f, 0x93, 0x1b, 0x74, 0x42, 0x0b,
+			],
+			y = [
+				0x7b, 0x6b, 0x8f, 0x14, 0x8b, 0x7a, 0x5d, 0xbe,
+				0xa5, 0xd0, 0x6b, 0x7a, 0x3c, 0xaa, 0x6f, 0x4b,
+				0x2a, 0xeb, 0x52, 0x7e, 0x80, 0xdb, 0xc5, 0x95,
+				0x73, 0x91, 0xab, 0x3d, 0x29, 0xbc, 0xbd, 0x74,
+			],
+			result = 0,
+			expected = [],
+		},
+		// invalid a and b
+		multc {
+			a = [
+				0x04, 0xee, 0x19, 0x71, 0x69, 0xae, 0xbd, 0x94,
+				0xc7, 0x8d, 0x4e, 0x0f, 0x05, 0x06, 0xb4, 0xdf,
+				0x60, 0x43, 0x22, 0x0c, 0x50, 0x8d, 0x33, 0x8a,
+				0x5a, 0xd0, 0x02, 0x00, 0x95, 0x5e, 0xda, 0x31,
+				0x64, 0x4c, 0x7e, 0x03, 0xa9, 0xa0, 0xaf, 0x56,
+				0xbb, 0x7c, 0x03, 0x04, 0x8a, 0x44, 0x30, 0xfd,
+				0x39, 0x1d, 0xc0, 0x0f, 0x5c, 0x50, 0xcf, 0xda,
+				0xf1, 0x99, 0x69, 0x0e, 0x77, 0x62, 0x56, 0x34,
+				0xce,
+			],
+			b = [
+				0x04, 0xbd, 0x52, 0x07, 0x83, 0xf0, 0x88, 0x6a,
+				0xa5, 0x24, 0xd8, 0x22, 0x13, 0xff, 0xc3, 0xf3,
+				0x03, 0xca, 0xe1, 0xd3, 0x5e, 0xf1, 0x95, 0x82,
+				0x5f, 0xa9, 0x95, 0x9f, 0xc3, 0xf4, 0x92, 0x8f,
+				0xd2, 0x30, 0x18, 0x56, 0x29, 0xf3, 0x74, 0x50,
+				0xbd, 0xa6, 0x8d, 0x88, 0xf6, 0xf3, 0xd6, 0x16,
+				0xd9, 0x9d, 0x01, 0x82, 0xbe, 0xf8, 0x13, 0xec,
+				0x9f, 0xb4, 0xb1, 0x18, 0xbc, 0x14, 0x09, 0x31,
+				0xad,
+			],
+			x = [
+				0xef, 0xcb, 0x7a, 0x72, 0x9c, 0xb4, 0x6a, 0xe6,
+				0xb4, 0x66, 0xa6, 0x8c, 0x1a, 0x18, 0x64, 0x5c,
+				0xd8, 0xdc, 0x9d, 0xa2, 0x5b, 0x48, 0x45, 0xcd,
+				0x68, 0x0c, 0x5f, 0x93, 0x1b, 0x74, 0x42, 0x0b,
+			],
+			y = [
+				0x7b, 0x6b, 0x8f, 0x14, 0x8b, 0x7a, 0x5d, 0xbe,
+				0xa5, 0xd0, 0x6b, 0x7a, 0x3c, 0xaa, 0x6f, 0x4b,
+				0x2a, 0xeb, 0x52, 0x7e, 0x80, 0xdb, 0xc5, 0x95,
+				0x73, 0x91, 0xab, 0x3d, 0x29, 0xbc, 0xbd, 0x74,
+			],
+			result = 0,
+			expected = [],
+		}
+	];
+
+	tmuladd(p256, tcs);
+};
+
+@test fn p384_mulgen() void = {
+	let c = p384;
+
+	// multiply by 1
+	let m1: [P384_SCALARSZ]u8 = [0...];
+	m1[len(m1) - 1] = 1;
+
+	let g = alloc(P384_G);
+	defer free(g);
+	assert(c.mul(g, m1) == 1);
+	assert(bytes::equal(g, P384_G));
+
+	assert(c.mulgen(g, m1) == 97);
+	assert(bytes::equal(g, P384_G));
+
+	// multiply by order - 1
+	let o = alloc(P384_N);
+	defer free(o);
+	o[len(o) - 1] -= 1;
+
+	const expected: [_]u8 = [
+		0x04, 0xaa, 0x87, 0xca, 0x22, 0xbe, 0x8b, 0x05, 0x37, 0x8e,
+		0xb1, 0xc7, 0x1e, 0xf3, 0x20, 0xad, 0x74, 0x6e, 0x1d, 0x3b,
+		0x62, 0x8b, 0xa7, 0x9b, 0x98, 0x59, 0xf7, 0x41, 0xe0, 0x82,
+		0x54, 0x2a, 0x38, 0x55, 0x02, 0xf2, 0x5d, 0xbf, 0x55, 0x29,
+		0x6c, 0x3a, 0x54, 0x5e, 0x38, 0x72, 0x76, 0x0a, 0xb7, 0xc9,
+		0xe8, 0x21, 0xb5, 0x69, 0xd9, 0xd3, 0x90, 0xa2, 0x61, 0x67,
+		0x40, 0x6d, 0x6d, 0x23, 0xd6, 0x07, 0x0b, 0xe2, 0x42, 0xd7,
+		0x65, 0xeb, 0x83, 0x16, 0x25, 0xce, 0xec, 0x4a, 0x0f, 0x47,
+		0x3e, 0xf5, 0x9f, 0x4e, 0x30, 0xe2, 0x81, 0x7e, 0x62, 0x85,
+		0xbc, 0xe2, 0x84, 0x6f, 0x15, 0xf1, 0xa0,
+	];
+
+	assert(c.mul(g, o) == 1);
+	assert(bytes::equal(g, expected));
+
+	let g = alloc(P384_G);
+	defer free(g);
+	assert(c.mulgen(g, o) == 97);
+	assert(bytes::equal(g, expected));
+
+	// random example
+	let priv: [_]u8 = [
+		0xde, 0x5c, 0x88, 0x05, 0x42, 0xa0, 0x71, 0xe2, 0xf6, 0xfe,
+		0xd0, 0xdc, 0x80, 0x07, 0x37, 0xc4, 0x35, 0xa6, 0x29, 0x48,
+		0x85, 0x70, 0x4f, 0x54, 0x1c, 0x41, 0x89, 0xaf, 0xf6, 0xbc,
+		0xb5, 0x19, 0x85, 0x70, 0x4f, 0x54, 0x1c, 0x41, 0x89, 0xaf,
+		0xf6, 0xbc, 0xb5, 0x19, 0xb5, 0x19, 0xb5, 0x19,
+	];
+	const expected: [_]u8 = [
+		0x04, 0xb6, 0xb4, 0x77, 0x6e, 0x08, 0x86, 0x64, 0x55, 0x3a,
+		0x64, 0xe0, 0xa3, 0xb0, 0x56, 0x39, 0x18, 0x4c, 0x5b, 0x15,
+		0x3b, 0x19, 0xd0, 0x99, 0xc5, 0xd2, 0x7a, 0x4a, 0x23, 0xb9,
+		0xf7, 0x37, 0x95, 0x4b, 0xe0, 0x97, 0x4c, 0x72, 0x16, 0x34,
+		0x76, 0xf6, 0xce, 0x9e, 0xb7, 0x52, 0xc9, 0x33, 0xed, 0x26,
+		0x5b, 0x7c, 0xb0, 0xd8, 0xe8, 0x8f, 0xc1, 0xb2, 0xd4, 0xbe,
+		0xe3, 0x20, 0xc7, 0x82, 0xc7, 0x71, 0x3d, 0xaf, 0xad, 0xdb,
+		0xe8, 0x56, 0xba, 0xad, 0x29, 0x76, 0x4c, 0x12, 0x24, 0xe9,
+		0x59, 0xca, 0xef, 0x09, 0x48, 0x0d, 0x40, 0xb1, 0xce, 0x5f,
+		0xcf, 0x6b, 0x62, 0xfd, 0xe5, 0x4b, 0xcc,
+	];
+
+	let g = alloc(P384_G);
+	defer free(g);
+	assert(c.mul(g, priv) == 1);
+	assert(bytes::equal(g, expected));
+
+	let r: [97]u8 = [0...];
+	assert(c.mulgen(r, priv) == 97);
+	assert(bytes::equal(r, expected));
+};
+
+@test fn p521_mulgen() void = {
+	let c = p521;
+
+	// multiply by 1
+	let m1: [P521_SCALARSZ]u8 = [0...];
+	m1[len(m1) - 1] = 1;
+
+	let g = alloc(P521_G);
+	defer free(g);
+	assert(c.mul(g, m1) == 1);
+	assert(bytes::equal(g, P521_G));
+
+	assert(c.mulgen(g, m1) == 133);
+	assert(bytes::equal(g, P521_G));
+
+	// multiply by order - 1
+	let o = alloc(P521_N);
+	defer free(o);
+	o[len(o) - 1] -= 1;
+
+	const expected: [_]u8 = [
+		0x04, 0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, 0xe9,
+		0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95, 0xb4, 0x42, 0x9c,
+		0x64, 0x81, 0x39, 0x05, 0x3f, 0xb5, 0x21, 0xf8, 0x28, 0xaf,
+		0x60, 0x6b, 0x4d, 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef,
+		0xe7, 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff, 0xa8,
+		0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a, 0x42, 0x9b, 0xf9,
+		0x7e, 0x7e, 0x31, 0xc2, 0xe5, 0xbd, 0x66, 0x00, 0xe7, 0xc6,
+		0xd6, 0x95, 0x87, 0x65, 0xc4, 0x3f, 0xfb, 0xa3, 0x75, 0xa0,
+		0x4b, 0xd3, 0x82, 0xe4, 0x26, 0x67, 0x0a, 0xbb, 0xb6, 0xa8,
+		0x64, 0xbb, 0x97, 0xe8, 0x50, 0x42, 0xe8, 0xd8, 0xc1, 0x99,
+		0xd3, 0x68, 0x11, 0x8d, 0x66, 0xa1, 0x0b, 0xd9, 0xbf, 0x3a,
+		0xaf, 0x46, 0xfe, 0xc0, 0x52, 0xf8, 0x9e, 0xca, 0xc3, 0x8f,
+		0x79, 0x5d, 0x8d, 0x3d, 0xbf, 0x77, 0x41, 0x6b, 0x89, 0x60,
+		0x2e, 0x99, 0xaf,
+	];
+
+	assert(c.mul(g, o) == 1);
+	assert(bytes::equal(g, expected));
+
+	let g = alloc(P521_G);
+	defer free(g);
+	assert(c.mulgen(g, o) == 133);
+	assert(bytes::equal(g, expected));
+
+	// random example
+	let priv: [_]u8 = [
+		0x01, 0x1f, 0xd5, 0x29, 0xff, 0x28, 0x24, 0x93, 0x43, 0x23,
+		0x3c, 0x0f, 0xe4, 0x15, 0x6a, 0xea, 0x1c, 0x95, 0xe6, 0xf1,
+		0x5e, 0x19, 0x30, 0xfe, 0xb4, 0xea, 0x0d, 0x24, 0xb3, 0x67,
+		0xa0, 0xeb, 0xcf, 0xed, 0x5c, 0xcf, 0xcf, 0x9f, 0x9d, 0x48,
+		0x04, 0x8b, 0xb1, 0x56, 0x53, 0xba, 0xa4, 0x86, 0x01, 0x29,
+		0x61, 0x21, 0xca, 0xd7, 0x63, 0x84, 0x35, 0x45, 0xc3, 0x9b,
+		0x55, 0xff, 0x9a, 0x4f, 0x27, 0x03,
+	];
+	const expected: [_]u8 = [
+		0x04, 0x00, 0x9f, 0x72, 0x15, 0x21, 0x0c, 0xdd, 0x22, 0x24,
+		0x16, 0xbb, 0x18, 0xa6, 0x46, 0x21, 0xee, 0x4f, 0x73, 0x1e,
+		0x9e, 0x03, 0xaf, 0x79, 0xc7, 0x1f, 0xf0, 0x33, 0x1c, 0xa6,
+		0xa1, 0x13, 0x98, 0x95, 0x6e, 0x88, 0x44, 0x6b, 0xa4, 0xbd,
+		0x4d, 0x49, 0x9e, 0xcb, 0x96, 0x0e, 0xdf, 0x67, 0x89, 0xca,
+		0xd5, 0xc3, 0x9e, 0x99, 0xa8, 0xc9, 0x00, 0x4b, 0x2c, 0x49,
+		0x57, 0xd7, 0xee, 0xf4, 0x6e, 0x07, 0xbb, 0x01, 0x35, 0xe2,
+		0xa4, 0xd6, 0x60, 0x99, 0x2c, 0x62, 0xad, 0xab, 0x89, 0xb5,
+		0xe7, 0xb6, 0x4b, 0x44, 0x89, 0xcf, 0xb0, 0x19, 0x29, 0xf8,
+		0x54, 0x76, 0x07, 0x40, 0x3a, 0xee, 0x41, 0x0c, 0x64, 0xeb,
+		0xda, 0x1e, 0x22, 0x4a, 0xee, 0xdb, 0x16, 0xa0, 0x5d, 0xe1,
+		0x95, 0x2f, 0xe3, 0xbc, 0x53, 0xd3, 0xf4, 0x36, 0x76, 0x03,
+		0x1f, 0xd8, 0xc6, 0xf4, 0x23, 0x4f, 0x45, 0x4e, 0x11, 0xeb,
+		0xb2, 0xb1, 0x62,
+	];
+
+	let g = alloc(P521_G);
+	defer free(g);
+	assert(c.mul(g, priv) == 1);
+	assert(bytes::equal(g, expected));
+
+	let r: [133]u8 = [0...];
+	assert(c.mulgen(r, priv) == 133);
+	assert(bytes::equal(r, expected));
+};
diff --git a/crypto/ec/genkeys+test.ha b/crypto/ec/genkeys+test.ha
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: MPL-2.0
+// (c) Hare authors <https://harelang.org>
+
+use bytes;
+use io;
+use memio;
+
+
+@test fn keygen_p256() void = edgecases(p256);
+
+fn edgecases(c: *curve) void = {
+	const scalarsz = len(c.order());
+	let priv: [MAX_SCALARSZ]u8 = [0...];
+	let priv = priv[..scalarsz];
+
+	let zero: [MAX_SCALARSZ]u8 = [0...];
+	let zero = zero[..scalarsz];
+
+	let rnd = memio::fixed(zero);
+	assert(keygen(c, priv, &rnd) is io::error);
+
+	let rnd = memio::fixed(c.order());
+	assert(keygen(c, priv, &rnd) is io::error);
+
+	let sub1: [MAX_SCALARSZ]u8 = [0...];
+	let sub1 = sub1[..scalarsz];
+
+	sub1[..] = c.order()[..];
+	sub1[len(sub1) - 1] -= 1;
+
+	let rnd = memio::fixed(sub1);
+	assert(keygen(c, priv, &rnd): size == scalarsz);
+	assert(bytes::equal(sub1, priv));
+};
diff --git a/crypto/ec/p256.ha b/crypto/ec/p256.ha
@@ -0,0 +1,1252 @@
+// SPDX-License-Identifier: MPL-2.0
+// (c) Hare authors <https://harelang.org>
+
+// Ported from BearSSL
+//
+// Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+use bytes;
+use crypto::math::*;
+
+
+// Note from the BearSSL documentation:
+//
+// The ec_p256_m31 implementation supports P-256 with specialised code,
+// including modular reduction routines that leverage the special format of the
+// field modulus, and internal split of data as sequences of 30-bit words, which
+// helps with carry propagation. ec_p256_m31 also includes fixed point
+// optimisations, for the common case of multiplying the conventional generator
+// point. These implementations are faster than the generic "i31" code, but with
+// a larger code footprint.
+//
+// Convert an integer from unsigned big-endian encoding to a sequence of 30-bit
+// words in little-endian order. The final "partial" word is returned.
+fn be8tole30(dest: []u32, src: []u8) u32 = {
+	let acc: u32 = 0;
+	let acclen: u32 = 0;
+	let destpos = 0;
+
+	for (let i = len(src); i > 0; i -= 1) {
+		let b = src[i - 1]: u32;
+		if (acclen < 22) {
+			acc |= b << acclen;
+			acclen += 8;
+		} else {
+			dest[destpos] = (acc | (b << acclen)) & 0x3FFFFFFF;
+			destpos += 1;
+			acc = b >> (30 - acclen);
+			acclen -= 22;
+		};
+	};
+	return acc;
+};
+
+// Convert an integer (30-bit words, little-endian) to unsigned
+// big-endian encoding. The total encoding length is provided; all
+// the destination bytes will be filled.
+fn le30tobe8(dest: []u8, src: []u32) void = {
+	let acc: u32 = 0;
+	let acclen: u32 = 0;
+	let srcpos: size = 0;
+
+	for (let i = len(dest); i > 0; i -= 1) {
+		if (acclen < 8) {
+			let w = src[srcpos];
+			srcpos += 1;
+
+			dest[i - 1] = (acc | (w << acclen)): u8;
+			acc = w >> (8 - acclen);
+			acclen += 22;
+		} else {
+			dest[i - 1] = acc: u8;
+			acc >>= 8;
+			acclen -= 8;
+		};
+	};
+};
+
+@test fn be8tole30() void = {
+	let be8: [6]u8 = [0x11, 0x22, 0xF3, 0x44, 0x55, 0x66];
+	let le30result: [2]u32 = [0...];
+	let be8result: [6]u8 = [0...];
+
+	le30result[1] = be8tole30(le30result, be8);
+	le30tobe8(be8result, le30result);
+
+	assert(bytes::equal(be8, be8result));
+};
+
+fn arsh(x: u32, n: u32) u32 = (*(&x: *i32) >> n: i32): u32;
+fn arshw(x: u64, n: u32) u64 = (*(&x: *i64) >> n: i32): u64;
+
+@test fn arsh() void = assert(arsh(0x80000000u32, 2) == 0xe0000000);
+
+// Multiply two integers. Source integers are represented as arrays of
+// nine 30-bit words, for values up to 2^270-1. Result is encoded over
+// 18 words of 30 bits each.
+fn mul9(d: []u32, a: []u32, b: []u32) void = {
+	// Maximum intermediate result is no more than
+	// 10376293531797946367, which fits in 64 bits. Reason:
+	//
+	//   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
+	//   10376293531797946367 < 9663676407 * 2^30
+	//
+	// Thus, adding together 9 products of 30-bit integers, with
+	// a carry of at most 9663676406, yields an integer that fits
+	// on 64 bits and generates a carry of at most 9663676406.
+	let t: [17]u64 = [0...];
+
+	t[0] = mulu32(a[0], b[0]);
+	t[1] = mulu32(a[0], b[1])
+		+ mulu32(a[1], b[0]);
+	t[2] = mulu32(a[0], b[2])
+		+ mulu32(a[1], b[1])
+		+ mulu32(a[2], b[0]);
+	t[3] = mulu32(a[0], b[3])
+		+ mulu32(a[1], b[2])
+		+ mulu32(a[2], b[1])
+		+ mulu32(a[3], b[0]);
+	t[4] = mulu32(a[0], b[4])
+		+ mulu32(a[1], b[3])
+		+ mulu32(a[2], b[2])
+		+ mulu32(a[3], b[1])
+		+ mulu32(a[4], b[0]);
+	t[5] = mulu32(a[0], b[5])
+		+ mulu32(a[1], b[4])
+		+ mulu32(a[2], b[3])
+		+ mulu32(a[3], b[2])
+		+ mulu32(a[4], b[1])
+		+ mulu32(a[5], b[0]);
+	t[6] = mulu32(a[0], b[6])
+		+ mulu32(a[1], b[5])
+		+ mulu32(a[2], b[4])
+		+ mulu32(a[3], b[3])
+		+ mulu32(a[4], b[2])
+		+ mulu32(a[5], b[1])
+		+ mulu32(a[6], b[0]);
+	t[7] = mulu32(a[0], b[7])
+		+ mulu32(a[1], b[6])
+		+ mulu32(a[2], b[5])
+		+ mulu32(a[3], b[4])
+		+ mulu32(a[4], b[3])
+		+ mulu32(a[5], b[2])
+		+ mulu32(a[6], b[1])
+		+ mulu32(a[7], b[0]);
+	t[8] = mulu32(a[0], b[8])
+		+ mulu32(a[1], b[7])
+		+ mulu32(a[2], b[6])
+		+ mulu32(a[3], b[5])
+		+ mulu32(a[4], b[4])
+		+ mulu32(a[5], b[3])
+		+ mulu32(a[6], b[2])
+		+ mulu32(a[7], b[1])
+		+ mulu32(a[8], b[0]);
+	t[9] = mulu32(a[1], b[8])
+		+ mulu32(a[2], b[7])
+		+ mulu32(a[3], b[6])
+		+ mulu32(a[4], b[5])
+		+ mulu32(a[5], b[4])
+		+ mulu32(a[6], b[3])
+		+ mulu32(a[7], b[2])
+		+ mulu32(a[8], b[1]);
+	t[10] = mulu32(a[2], b[8])
+		+ mulu32(a[3], b[7])
+		+ mulu32(a[4], b[6])
+		+ mulu32(a[5], b[5])
+		+ mulu32(a[6], b[4])
+		+ mulu32(a[7], b[3])
+		+ mulu32(a[8], b[2]);
+	t[11] = mulu32(a[3], b[8])
+		+ mulu32(a[4], b[7])
+		+ mulu32(a[5], b[6])
+		+ mulu32(a[6], b[5])
+		+ mulu32(a[7], b[4])
+		+ mulu32(a[8], b[3]);
+	t[12] = mulu32(a[4], b[8])
+		+ mulu32(a[5], b[7])
+		+ mulu32(a[6], b[6])
+		+ mulu32(a[7], b[5])
+		+ mulu32(a[8], b[4]);
+	t[13] = mulu32(a[5], b[8])
+		+ mulu32(a[6], b[7])
+		+ mulu32(a[7], b[6])
+		+ mulu32(a[8], b[5]);
+	t[14] = mulu32(a[6], b[8])
+		+ mulu32(a[7], b[7])
+		+ mulu32(a[8], b[6]);
+	t[15] = mulu32(a[7], b[8])
+		+ mulu32(a[8], b[7]);
+	t[16] = mulu32(a[8], b[8]);
+
+	// Propagate carries.
+	let cc: u64 = 0;
+	for (let i = 0z; i < 17; i += 1) {
+		let w = t[i] + cc;
+		d[i] = w: u32 & 0x3FFFFFFF;
+		cc = w >> 30;
+	};
+	d[17] = cc: u32;
+};
+
+// Square a 270-bit integer, represented as an array of nine 30-bit words.
+// Result uses 18 words of 30 bits each.
+fn square9(d: []u32, a: []u32) void = {
+	let t: [17]u64 = [0...];
+
+	t[0] = mulu32(a[0], a[0]);
+	t[1] = ((mulu32(a[0], a[1])) << 1);
+	t[2] = mulu32(a[1], a[1])
+		+ ((mulu32(a[0], a[2])) << 1);
+	t[3] = ((mulu32(a[0], a[3])
+		+ mulu32(a[1], a[2])) << 1);
+	t[4] = mulu32(a[2], a[2])
+		+ ((mulu32(a[0], a[4])
+		+ mulu32(a[1], a[3])) << 1);
+	t[5] = ((mulu32(a[0], a[5])
+		+ mulu32(a[1], a[4])
+		+ mulu32(a[2], a[3])) << 1);
+	t[6] = mulu32(a[3], a[3])
+		+ ((mulu32(a[0], a[6])
+		+ mulu32(a[1], a[5])
+		+ mulu32(a[2], a[4])) << 1);
+	t[7] = ((mulu32(a[0], a[7])
+		+ mulu32(a[1], a[6])
+		+ mulu32(a[2], a[5])
+		+ mulu32(a[3], a[4])) << 1);
+	t[8] = mulu32(a[4], a[4])
+		+ ((mulu32(a[0], a[8])
+		+ mulu32(a[1], a[7])
+		+ mulu32(a[2], a[6])
+		+ mulu32(a[3], a[5])) << 1);
+	t[9] = ((mulu32(a[1], a[8])
+		+ mulu32(a[2], a[7])
+		+ mulu32(a[3], a[6])
+		+ mulu32(a[4], a[5])) << 1);
+	t[10] = mulu32(a[5], a[5])
+		+ ((mulu32(a[2], a[8])
+		+ mulu32(a[3], a[7])
+		+ mulu32(a[4], a[6])) << 1);
+	t[11] = ((mulu32(a[3], a[8])
+		+ mulu32(a[4], a[7])
+		+ mulu32(a[5], a[6])) << 1);
+	t[12] = mulu32(a[6], a[6])
+		+ ((mulu32(a[4], a[8])
+		+ mulu32(a[5], a[7])) << 1);
+	t[13] = ((mulu32(a[5], a[8])
+		+ mulu32(a[6], a[7])) << 1);
+	t[14] = mulu32(a[7], a[7])
+		+ ((mulu32(a[6], a[8])) << 1);
+	t[15] = ((mulu32(a[7], a[8])) << 1);
+	t[16] = mulu32(a[8], a[8]);
+
+	// Propagate carries.
+	let cc: u64 = 0;
+	for (let i = 0z; i < 17; i += 1) {
+		let w = t[i] + cc;
+		d[i] = w: u32 & 0x3FFFFFFF;
+		cc = w >> 30;
+	};
+	d[17] = cc: u32;
+};
+
+// Base field modulus for P-256.
+const F256: [_]u32 = [
+	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
+	0x00001000, 0x3FFFC000, 0x0000FFFF
+];
+
+// The 'b' curve equation coefficient for P-256.
+const P256_B: [_]u32 = [
+	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65, 0x2EF555DA,
+	0x293E7B3E, 0x0D762A8E, 0x00005AC6
+];
+
+// Addition in the field. Source operands shall fit on 257 bits; output
+// will be lower than twice the modulus.
+fn add_f256(d: []u32, a: []u32, b: []u32) void = {
+	let w: u32 = 0;
+
+	let cc: u32 = 0;
+	for (let i = 0z; i < 9; i += 1) {
+		w = a[i] + b[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = w >> 30;
+	};
+	w >>= 16;
+	d[8] &= 0xFFFF;
+	d[3] -= w << 6;
+	d[6] -= w << 12;
+	d[7] += w << 14;
+	cc = w;
+	for (let i = 0z; i < 9; i += 1) {
+		w = d[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = arsh(w, 30);
+	};
+};
+
+// Subtraction in the field. Source operands shall be smaller than twice
+// the modulus; the result will fulfil the same property.
+fn sub_f256(d: []u32, a: []u32, b: []u32) void = {
+	let w: u32 = 0;
+	let cc: u32 = 0;
+
+	// We really compute a - b + 2*p to make sure that the result is
+	// positive.
+	w = a[0] - b[0] - 0x00002;
+	d[0] = w & 0x3FFFFFFF;
+	w = a[1] - b[1] + arsh(w, 30);
+	d[1] = w & 0x3FFFFFFF;
+	w = a[2] - b[2] + arsh(w, 30);
+	d[2] = w & 0x3FFFFFFF;
+	w = a[3] - b[3] + arsh(w, 30) + 0x00080;
+	d[3] = w & 0x3FFFFFFF;
+	w = a[4] - b[4] + arsh(w, 30);
+	d[4] = w & 0x3FFFFFFF;
+	w = a[5] - b[5] + arsh(w, 30);
+	d[5] = w & 0x3FFFFFFF;
+	w = a[6] - b[6] + arsh(w, 30) + 0x02000;
+	d[6] = w & 0x3FFFFFFF;
+	w = a[7] - b[7] + arsh(w, 30) - 0x08000;
+	d[7] = w & 0x3FFFFFFF;
+	w = a[8] - b[8] + arsh(w, 30) + 0x20000;
+	d[8] = w & 0xFFFF;
+	w >>= 16;
+	d[8] &= 0xFFFF;
+	d[3] -= w << 6;
+	d[6] -= w << 12;
+	d[7] += w << 14;
+	cc = w;
+	for (let i = 0z; i < 9; i += 1) {
+		w = d[i] + cc;
+		d[i] = w & 0x3FFFFFFF;
+		cc = arsh(w, 30);
+	};
+};
+
+// Compute a multiplication in F256. Source operands shall be less than
+// twice the modulus.
+fn mul_f256(d: []u32, a: []u32, b: []u32) void = {
+	let t: [18]u32 = [0...];
+	let s: [18]u64 = [0...];
+	let x: u64 = 0;
+
+	mul9(t, a, b);
+
+	// Modular reduction: each high word in added/subtracted where
+	// necessary.
+	//
+	// The modulus is:
+	//    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+	// Therefore:
+	//    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	//
+	// For a word x at bit offset n (n >= 256), we have:
+	//    x*2^n = x*2^(n-32) - x*2^(n-64)
+	//	    - x*2^(n - 160) + x*2^(n-256) mod p
+	//
+	// Thus, we can nullify the high word if we reinject it at some
+	// proper emplacements.
+	//
+	// We use 64-bit intermediate words to allow for carries to
+	// accumulate easily, before performing the final propagation.
+	for (let i = 0; i < 18; i += 1) {
+		s[i] = t[i];
+	};
+
+	for (let i = 17; i >= 9; i -= 1) {
+		let y = s[i];
+		s[i - 1] += arshw(y, 2);
+		s[i - 2] += (y << 28) & 0x3FFFFFFF;
+		s[i - 2] -= arshw(y, 4);
+		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
+		s[i - 5] -= arshw(y, 10);
+		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
+		s[i - 8] += arshw(y, 16);
+		s[i - 9] += (y << 14) & 0x3FFFFFFF;
+	};
+
+	// Carry propagation must be signed. Moreover, we may have overdone
+	// it a bit, and obtain a negative result.
+	//
+	// The loop above ran 9 times; each time, each word was augmented
+	// by at most one extra word (in absolute value). Thus, the top
+	// word must in fine fit in 39 bits, so the carry below will fit
+	// on 9 bits.
+	let cc: u64 = 0;
+	for (let i = 0z; i < 9; i += 1) {
+		x = s[i] + cc;
+		d[i] = x: u32 & 0x3FFFFFFF;
+		cc = arshw(x, 30);
+	};
+
+	// All nine words fit on 30 bits, but there may be an extra
+	// carry for a few bits (at most 9), and that carry may be
+	// negative. Moreover, we want the result to fit on 257 bits.
+	// The two lines below ensure that the word in d[] has length
+	// 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
+	// significant length of cc is less than 24 bits, so we will be
+	// able to switch to 32-bit operations.
+	cc = arshw(x, 16);
+	d[8] &= 0xFFFF;
+
+	// One extra round of reduction, for cc*2^256, which means
+	// adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
+	// value. If cc is negative, then it may happen (rarely, but
+	// not neglectibly so) that the result would be negative. In
+	// order to avoid that, if cc is negative, then we add the
+	// modulus once. Note that if cc is negative, then propagating
+	// that carry must yield a value lower than the modulus, so
+	// adding the modulus once will keep the final result under
+	// twice the modulus.
+	let z = cc: u32;
+	d[3] -= z << 6;
+	d[6] -= (z << 12) & 0x3FFFFFFF;
+	d[7] -= arsh(z, 18);
+	d[7] += (z << 14) & 0x3FFFFFFF;
+	d[8] += arsh(z, 16);
+	let c = z >> 31;
+	d[0] -= c;
+	d[3] += c << 6;
+	d[6] += c << 12;
+	d[7] -= c << 14;
+	d[8] += c << 16;
+
+	for (let i = 0z; i < 9; i += 1) {
+		let w = d[i] + z;
+		d[i] = w & 0x3FFFFFFF;
+		z = arsh(w, 30);
+	};
+};
+
+
+// Compute a square in F256. Source operand shall be less than
+// twice the modulus.
+fn square_f256(d: []u32, a: []u32) void = {
+	let t: [18]u32 = [0...];
+	let s: [18]u64 = [0...];
+
+	square9(t, a);
+
+	// Modular reduction: each high word in added/subtracted where
+	// necessary.
+	//
+	// The modulus is:
+	//    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
+	// Therefore:
+	//    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
+	//
+	// For a word x at bit offset n (n >= 256), we have:
+	//    x*2^n = x*2^(n-32) - x*2^(n-64)
+	//	    - x*2^(n - 160) + x*2^(n-256) mod p
+	//
+	// Thus, we can nullify the high word if we reinject it at some
+	// proper emplacements.
+	//
+	// We use 64-bit intermediate words to allow for carries to
+	// accumulate easily, before performing the final propagation.
+	for (let i = 0; i < 18; i += 1) {
+		s[i] = t[i];
+	};
+
+	for (let i = 17; i >= 9; i -= 1) {
+		let y = s[i];
+		s[i - 1] += arshw(y, 2);
+		s[i - 2] += (y << 28) & 0x3FFFFFFF;
+		s[i - 2] -= arshw(y, 4);
+		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
+		s[i - 5] -= arshw(y, 10);
+		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
+		s[i - 8] += arshw(y, 16);
+		s[i - 9] += (y << 14) & 0x3FFFFFFF;
+	};
+
+	// Carry propagation must be signed. Moreover, we may have overdone
+	// it a bit, and obtain a negative result.
+	//
+	// The loop above ran 9 times; each time, each word was augmented
+	// by at most one extra word (in absolute value). Thus, the top
+	// word must in fine fit in 39 bits, so the carry below will fit
+	// on 9 bits.
+	let cc: u64 = 0;
+	let x: u64 = 0;
+	for (let i = 0; i < 9; i += 1) {
+		x = s[i] + cc;
+		d[i] = x: u32 & 0x3FFFFFFF;
+		cc = arshw(x, 30);
+	};
+
+	// All nine words fit on 30 bits, but there may be an extra
+	// carry for a few bits (at most 9), and that carry may be
+	// negative. Moreover, we want the result to fit on 257 bits.
+	// The two lines below ensure that the word in d[] has length
+	// 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
+	// significant length of cc is less than 24 bits, so we will be
+	// able to switch to 32-bit operations.
+	cc = arshw(x, 16);
+	d[8] &= 0xFFFF;
+
+	// One extra round of reduction, for cc*2^256, which means
+	// adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
+	// value. If cc is negative, then it may happen (rarely, but
+	// not neglectibly so) that the result would be negative. In
+	// order to avoid that, if cc is negative, then we add the
+	// modulus once. Note that if cc is negative, then propagating
+	// that carry must yield a value lower than the modulus, so
+	// adding the modulus once will keep the final result under
+	// twice the modulus.
+	let z = cc: u32;
+	d[3] -= z << 6;
+	d[6] -= (z << 12) & 0x3FFFFFFF;
+	d[7] -= arsh(z, 18);
+	d[7] += (z << 14) & 0x3FFFFFFF;
+	d[8] += arsh(z, 16);
+	let c = z >> 31;
+	d[0] -= c;
+	d[3] += c << 6;
+	d[6] += c << 12;
+	d[7] -= c << 14;
+	d[8] += c << 16;
+
+	for (let i = 0z; i < 9; i += 1) {
+		let w = d[i] + z;
+		d[i] = w & 0x3FFFFFFF;
+		z = arsh(w, 30);
+	};
+};
+
+// Perform a "final reduction" in field F256 (field for curve P-256).
+// The source value must be less than twice the modulus. If the value
+// is not lower than the modulus, then the modulus is subtracted and
+// this function returns 1; otherwise, it leaves it untouched and it
+// returns 0.
+fn reduce_final_f256(d: []u32) u32 = {
+	let t: [9]u32 = [0...];
+
+	let cc: u32 = 0;
+	for (let i = 0; i < 9; i += 1) {
+		let w = d[i] - F256[i] - cc;
+		cc = w >> 31;
+		t[i] = w & 0x3FFFFFFF;
+	};
+	cc ^= 1;
+	ccopyu32(cc, d, t);
+	return cc;
+};
+
+// Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
+// are such that:
+//   X = x / z^2
+//   Y = y / z^3
+// For the point at infinity, z = 0.
+// Each point thus admits many possible representations.
+//
+// Coordinates are represented in arrays of 32-bit integers, each holding
+// 30 bits of data. Values may also be slightly greater than the modulus,
+// but they will always be lower than twice the modulus.
+type p256_jacobian = struct {
+	x: [9]u32,
+	y: [9]u32,
+	z: [9]u32,
+};
+
+// Convert a point to affine coordinates:
+//  - If the point is the point at infinity, then all three coordinates
+//    are set to 0.
+//  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
+//    coordinates are the 'X' and 'Y' affine coordinates.
+// The coordinates are guaranteed to be lower than the modulus.
+fn p256_to_affine(p: *p256_jacobian) void = {
+	let t1: [9]u32 = [0...];
+	let t2: [9]u32 = [0...];
+
+	// Invert z with a modular exponentiation: the modulus is
+	// p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
+	// p-2. Exponent bit pattern (from high to low) is:
+	//  - 32 bits of value 1
+	//  - 31 bits of value 0
+	//  - 1 bit of value 1
+	//  - 96 bits of value 0
+	//  - 94 bits of value 1
+	//  - 1 bit of value 0
+	//  - 1 bit of value 1
+	// Thus, we precompute z^(2^31-1) to speed things up.
+	//
+	// If z = 0 (point at infinity) then the modular exponentiation
+	// will yield 0, which leads to the expected result (all three
+	// coordinates set to 0).
+
+	// A simple square-and-multiply for z^(2^31-1). We could save about
+	// two dozen multiplications here with an addition chain, but
+	// this would require a bit more code, and extra stack buffers.
+	t1[..] = p.z[..];
+	for (let i = 0; i < 30; i += 1) {
+		square_f256(t1, t1);
+		mul_f256(t1, t1, p.z);
+	};
+
+	// Square-and-multiply. Apart from the squarings, we have a few
+	// multiplications to set bits to 1; we multiply by the original z
+	// for setting 1 bit, and by t1 for setting 31 bits.
+	t2[..] = p.z[..];
+	for (let i = 1; i < 256; i += 1) {
+		square_f256(t2, t2);
+		switch (i) {
+		case 31, 190, 221, 252 =>
+			mul_f256(t2, t2, t1);
+		case 63, 253, 255 =>
+			mul_f256(t2, t2, p.z);
+		case =>
+			yield;
+		};
+	};
+
+	// Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
+	mul_f256(t1, t2, t2);
+	mul_f256(p.x, t1, p.x);
+	mul_f256(t1, t1, t2);
+	mul_f256(p.y, t1, p.y);
+	reduce_final_f256(p.x);
+	reduce_final_f256(p.y);
+
+	// Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
+	// this will set z to 1.
+	mul_f256(p.z, p.z, t2);
+	reduce_final_f256(p.z);
+};
+
+// Double a point in P-256. This function works for all valid points,
+// including the point at infinity.
+fn p256_double(q: *p256_jacobian) void = {
+	// Doubling formulas are:
+	//
+	//   s = 4*x*y^2
+	//   m = 3*(x + z^2)*(x - z^2)
+	//   x' = m^2 - 2*s
+	//   y' = m*(s - x') - 8*y^4
+	//   z' = 2*y*z
+	//
+	// These formulas work for all points, including points of order 2
+	// and points at infinity:
+	//   - If y = 0 then z' = 0. But there is no such point in P-256
+	//     anyway.
+	//   - If z = 0 then z' = 0.
+	let t1: [9]u32 = [0...];
+	let t2: [9]u32 = [0...];
+	let t3: [9]u32 = [0...];
+	let t4: [9]u32 = [0...];
+
+	// Compute z^2 in t1.
+	square_f256(t1, q.z);
+
+	// Compute x-z^2 in t2 and x+z^2 in t1.
+	add_f256(t2, q.x, t1);
+	sub_f256(t1, q.x, t1);
+
+	// Compute 3*(x+z^2)*(x-z^2) in t1.
+	mul_f256(t3, t1, t2);
+	add_f256(t1, t3, t3);
+	add_f256(t1, t3, t1);
+
+	// Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
+	square_f256(t3, q.y);
+	add_f256(t3, t3, t3);
+	mul_f256(t2, q.x, t3);
+	add_f256(t2, t2, t2);
+
+
+	// Compute x' = m^2 - 2*s.
+	square_f256(q.x, t1);
+	sub_f256(q.x, q.x, t2);
+	sub_f256(q.x, q.x, t2);
+
+	// Compute z' = 2*y*z.
+	mul_f256(t4, q.y, q.z);
+	add_f256(q.z, t4, t4);
+
+	// Compute y' = m*(s - x') - 8*y^4. Note that we already have
+	// 2*y^2 in t3.
+	sub_f256(t2, t2, q.x);
+	mul_f256(q.y, t1, t2);
+	square_f256(t4, t3);
+	add_f256(t4, t4, t4);
+	sub_f256(q.y, q.y, t4);
+};
+
+// Add point P2 to point P1.
+//
+// This function computes the wrong result in the following cases:
+//
+//   - If P1 == 0 but P2 != 0
+//   - If P1 != 0 but P2 == 0
+//   - If P1 == P2
+//
+// In all three cases, P1 is set to the point at infinity.
+//
+// Returned value is 0 if one of the following occurs:
+//
+//   - P1 and P2 have the same Y coordinate
+//   - P1 == 0 and P2 == 0
+//   - The Y coordinate of one of the points is 0 and the other point is
+//     the point at infinity.
+//
+// The third case cannot actually happen with valid points, since a point
+// with Y == 0 is a point of order 2, and there is no point of order 2 on
+// curve P-256.
+//
+// Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
+// can apply the following:
+//
+//   - If the result is not the point at infinity, then it is correct.
+//   - Otherwise, if the returned value is 1, then this is a case of
+//     P1+P2 == 0, so the result is indeed the point at infinity.
+//   - Otherwise, P1 == P2, so a "double" operation should have been
+//     performed.
+fn p256_add(p1: *p256_jacobian, p2: *p256_jacobian) u32 = {
+	// Addtions formulas are:
+	//
+	//   u1 = x1 * z2^2
+	//   u2 = x2 * z1^2
+	//   s1 = y1 * z2^3
+	//   s2 = y2 * z1^3
+	//   h = u2 - u1
+	//   r = s2 - s1
+	//   x3 = r^2 - h^3 - 2 * u1 * h^2
+	//   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	//   z3 = h * z1 * z2
+	let t1: [9]u32 = [0...];
+	let t2: [9]u32 = [0...];
+	let t3: [9]u32 = [0...];
+	let t4: [9]u32 = [0...];
+	let t5: [9]u32 = [0...];
+	let t6: [9]u32 = [0...];
+	let t7: [9]u32 = [0...];
+	let ret: u32 = 0;
+
+	// Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
+	square_f256(t3, p2.z);
+	mul_f256(t1, p1.x, t3);
+	mul_f256(t4, p2.z, t3);
+	mul_f256(t3, p1.y, t4);
+
+	// Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	square_f256(t4, p1.z);
+	mul_f256(t2, p2.x, t4);
+	mul_f256(t5, p1.z, t4);
+	mul_f256(t4, p2.y, t5);
+
+	// Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
+	// We need to test whether r is zero, so we will do some extra
+	// reduce.
+	sub_f256(t2, t2, t1);
+	sub_f256(t4, t4, t3);
+	reduce_final_f256(t4);
+	ret = 0;
+	for (let i = 0; i < 9; i += 1) {
+		ret |= t4[i];
+	};
+	ret = (ret | -ret) >> 31;
+
+	// Compute u1*h^2 (in t6) and h^3 (in t5);
+	square_f256(t7, t2);
+	mul_f256(t6, t1, t7);
+	mul_f256(t5, t7, t2);
+
+	// Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	square_f256(p1.x, t4);
+	sub_f256(p1.x, p1.x, t5);
+	sub_f256(p1.x, p1.x, t6);
+	sub_f256(p1.x, p1.x, t6);
+
+	// Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	sub_f256(t6, t6, p1.x);
+	mul_f256(p1.y, t4, t6);
+	mul_f256(t1, t5, t3);
+	sub_f256(p1.y, p1.y, t1);
+
+	// Compute z3 = h*z1*z2.
+	mul_f256(t1, p1.z, p2.z);
+	mul_f256(p1.z, t1, t2);
+
+	return ret;
+};
+
+// Add point P2 to point P1. This is a specialised function for the
+// case when P2 is a non-zero point in affine coordinate.
+//
+// This function computes the wrong result in the following cases:
+//
+//   - If P1 == 0
+//   - If P1 == P2
+//
+// In both cases, P1 is set to the point at infinity.
+//
+// Returned value is 0 if one of the following occurs:
+//
+//   - P1 and P2 have the same Y coordinate
+//   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
+//
+// The second case cannot actually happen with valid points, since a point
+// with Y == 0 is a point of order 2, and there is no point of order 2 on
+// curve P-256.
+//
+// Therefore, assuming that P1 != 0 on input, then the caller
+// can apply the following:
+//
+//   - If the result is not the point at infinity, then it is correct.
+//   - Otherwise, if the returned value is 1, then this is a case of
+//     P1+P2 == 0, so the result is indeed the point at infinity.
+//   - Otherwise, P1 == P2, so a "double" operation should have been
+//     performed.
+fn p256_add_mixed(p1: *p256_jacobian, p2: *p256_jacobian) u32 = {
+	// Addtions formulas are:
+	//
+	//   u1 = x1
+	//   u2 = x2 * z1^2
+	//   s1 = y1
+	//   s2 = y2 * z1^3
+	//   h = u2 - u1
+	//   r = s2 - s1
+	//   x3 = r^2 - h^3 - 2 * u1 * h^2
+	//   y3 = r * (u1 * h^2 - x3) - s1 * h^3
+	//   z3 = h * z1
+	let t1: [9]u32 = [0...];
+	let t2: [9]u32 = [0...];
+	let t3: [9]u32 = [0...];
+	let t4: [9]u32 = [0...];
+	let t5: [9]u32 = [0...];
+	let t6: [9]u32 = [0...];
+	let t7: [9]u32 = [0...];
+	let ret: u32 = 0;
+
+	// Compute u1 = x1 (in t1) and s1 = y1 (in t3).
+	t1[..] = p1.x[..];
+	t3[..] = p1.y[..];
+
+	// Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
+	square_f256(t4, p1.z);
+	mul_f256(t2, p2.x, t4);
+	mul_f256(t5, p1.z, t4);
+	mul_f256(t4, p2.y, t5);
+
+	// Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
+	// We need to test whether r is zero, so we will do some extra
+	// reduce.
+	sub_f256(t2, t2, t1);
+	sub_f256(t4, t4, t3);
+	reduce_final_f256(t4);
+	ret = 0;
+	for (let i = 0; i < 9; i += 1) {
+		ret |= t4[i];
+	};
+	ret = (ret | -ret) >> 31;
+
+	// Compute u1*h^2 (in t6) and h^3 (in t5);
+	square_f256(t7, t2);
+	mul_f256(t6, t1, t7);
+	mul_f256(t5, t7, t2);
+
+	// Compute x3 = r^2 - h^3 - 2*u1*h^2.
+	square_f256(p1.x, t4);
+	sub_f256(p1.x, p1.x, t5);
+	sub_f256(p1.x, p1.x, t6);
+	sub_f256(p1.x, p1.x, t6);
+
+	// Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
+	sub_f256(t6, t6, p1.x);
+	mul_f256(p1.y, t4, t6);
+	mul_f256(t1, t5, t3);
+	sub_f256(p1.y, p1.y, t1);
+
+	// Compute z3 = h*z1*z2.
+	mul_f256(p1.z, p1.z, t2);
+
+	return ret;
+};
+
+// Decode a P-256 point. This function does not support the point at
+// infinity. Returned value is 0 if the point is invalid, 1 otherwise.
+fn p256_decode(p: *p256_jacobian, src: const []u8) u32 = {
+	let tx: [9]u32 = [0...];
+	let ty: [9]u32 = [0...];
+	let t1: [9]u32 = [0...];
+	let t2: [9]u32 = [0...];
+	let bad: u32 = 0;
+
+	if (len(src) != 65) {
+		return 0;
+	};
+	let buf = src;
+
+	// First byte must be 0x04 (uncompressed format). We could support
+	// "hybrid format" (first byte is 0x06 or 0x07, and encodes the
+	// least significant bit of the Y coordinate), but it is explicitly
+	// forbidden by RFC 5480 (section 2.2).
+	bad = nequ32(buf[0], 0x04);
+
+	// Decode the coordinates, and check that they are both lower
+	// than the modulus.
+	tx[8] = be8tole30(tx, buf[1..33]);
+	ty[8] = be8tole30(ty, buf[33..]);
+	bad |= reduce_final_f256(tx);
+	bad |= reduce_final_f256(ty);
+
+	// Check curve equation.
+	square_f256(t1, tx);
+	mul_f256(t1, tx, t1);
+	square_f256(t2, ty);
+	sub_f256(t1, t1, tx);
+	sub_f256(t1, t1, tx);
+	sub_f256(t1, t1, tx);
+	add_f256(t1, t1, P256_B);
+	sub_f256(t1, t1, t2);
+	reduce_final_f256(t1);
+	for (let i = 0; i < 9; i += 1) {
+		bad |= t1[i];
+	};
+
+	// Copy coordinates to the point structure.
+	p.x[..] = tx[..];
+	p.y[..] = ty[..];
+	p.z[..] = [0...];
+	p.z[0] = 1;
+	return equ32(bad, 0);
+};
+
+// Encode a point into a buffer. This function assumes that the point is
+// valid, in affine coordinates, and not the point at infinity.
+fn p256_encode(dest: []u8, p: *p256_jacobian) void = {
+	dest[0] = 0x04;
+	le30tobe8(dest[1..33], p.x);
+	le30tobe8(dest[33..], p.y);
+};
+
+// Multiply a curve point by an integer. The integer is assumed to be
+// lower than the curve order, and the base point must not be the point
+// at infinity.
+fn p256_mul(p: *p256_jacobian, x: []u8) void = {
+	// qz is a flag that is initially 1, and remains equal to 1
+	// as long as the point is the point at infinity.
+	//
+	// We use a 2-bit window to handle multiplier bits by pairs.
+	// The precomputed window really is the points P2 and P3.
+	let qz: u32 = 1;
+	let p2 = p256_jacobian { ... };
+	let p3 = p256_jacobian { ... };
+	let q = p256_jacobian { ... };
+	let t = p256_jacobian { ... };
+	let u = p256_jacobian { ... };
+	let xpos: size = 0;
+
+	// Compute window values.
+	p2 = *p;
+	p256_double(&p2);
+	p3 = *p;
+	p256_add(&p3, &p2);
+
+	// We start with Q = 0. We process multiplier bits 2 by 2.
+	for (let i = len(x); i > 0; i -= 1) {
+		for (let k = 6i8; k >= 0; k -= 2) {
+			let bits: u32 = 0;
+			let bnz: u32 = 0;
+
+			p256_double(&q);
+			p256_double(&q);
+			t = *p;
+			u = q;
+			bits = (x[xpos] >> k: u8) & 3u32;
+			bnz = nequ32(bits, 0);
+			jaccopy(equ32(bits, 2), &t, &p2);
+			jaccopy(equ32(bits, 3), &t, &p3);
+			p256_add(&u, &t);
+			jaccopy(bnz & qz, &q, &t);
+			jaccopy(bnz & ~qz, &q, &u);
+			qz &= ~bnz;
+		};
+		xpos += 1;
+	};
+	*p = q;
+};
+
+fn jaccopy(ctl: u32, dest: *p256_jacobian, src: *p256_jacobian) void = {
+	ccopyu32(ctl, dest.x, src.x);
+	ccopyu32(ctl, dest.y, src.y);
+	ccopyu32(ctl, dest.z, src.z);
+};
+
+// Precomputed window: k*G points, where G is the curve generator, and k
+// is an integer from 1 to 15 (inclusive). The X and Y coordinates of
+// the point are encoded as 9 words of 30 bits each (little-endian
+// order).
+const gwin: [15][18]u32 = [
+	[
+		0x1898c296, 0x1284e517, 0x1eb33a0f, 0x00df604b, 0x2440f277,
+		0x339b958e, 0x04247f8b, 0x347cb84b, 0x00006b17, 0x37bf51f5,
+		0x2ed901a0, 0x3315ecec, 0x338cd5da, 0x0f9e162b, 0x1fad29f0,
+		0x27f9b8ee, 0x10b8bf86, 0x00004fe3,
+	],
+	[
+		0x07669978, 0x182d23f1, 0x3f21b35a, 0x225a789d, 0x351ac3c0,
+		0x08e00c12, 0x34f7e8a5, 0x1ec62340, 0x00007cf2, 0x227873d1,
+		0x3812de74, 0x0e982299, 0x1f6b798f, 0x3430dbba, 0x366b1a7d,
+		0x2d040293, 0x154436e3, 0x00000777,
+	],
+	[
+		0x06e7fd6c, 0x2d05986f, 0x3ada985f, 0x31adc87b, 0x0bf165e6,
+		0x1fbe5475, 0x30a44c8f, 0x3934698c, 0x00005ecb, 0x227d5032,
+		0x29e6c49e, 0x04fb83d9, 0x0aac0d8e, 0x24a2ecd8, 0x2c1b3869,
+		0x0ff7e374, 0x19031266, 0x00008734,
+	],
+	[
+		0x2b030852, 0x024c0911, 0x05596ef5, 0x07f8b6de, 0x262bd003,
+		0x3779967b, 0x08fbba02, 0x128d4cb4, 0x0000e253, 0x184ed8c6,
+		0x310b08fc, 0x30ee0055, 0x3f25b0fc, 0x062d764e, 0x3fb97f6a,
+		0x33cc719d, 0x15d69318, 0x0000e0f1,
+	],
+	[
+		0x03d033ed, 0x05552837, 0x35be5242, 0x2320bf47, 0x268fdfef,
+		0x13215821, 0x140d2d78, 0x02de9454, 0x00005159, 0x3da16da4,
+		0x0742ed13, 0x0d80888d, 0x004bc035, 0x0a79260d, 0x06fcdafe,
+		0x2727d8ae, 0x1f6a2412, 0x0000e0c1,
+	],
+	[
+		0x3c2291a9, 0x1ac2aba4, 0x3b215b4c, 0x131d037a, 0x17dde302,
+		0x0c90b2e2, 0x0602c92d, 0x05ca9da9, 0x0000b01a, 0x0fc77fe2,
+		0x35f1214e, 0x07e16bdf, 0x003ddc07, 0x2703791c, 0x3038b7ee,
+		0x3dad56fe, 0x041d0c8d, 0x0000e85c,
+	],
+	[
+		0x3187b2a3, 0x0018a1c0, 0x00fef5b3, 0x3e7e2e2a, 0x01fb607e,
+		0x2cc199f0, 0x37b4625b, 0x0edbe82f, 0x00008e53, 0x01f400b4,
+		0x15786a1b, 0x3041b21c, 0x31cd8cf2, 0x35900053, 0x1a7e0e9b,
+		0x318366d0, 0x076f780c, 0x000073eb,
+	],
+	[
+		0x1b6fb393, 0x13767707, 0x3ce97dbb, 0x348e2603, 0x354cadc1,
+		0x09d0b4ea, 0x1b053404, 0x1de76fba, 0x000062d9, 0x0f09957e,
+		0x295029a8, 0x3e76a78d, 0x3b547dae, 0x27cee0a2, 0x0575dc45,
+		0x1d8244ff, 0x332f647a, 0x0000ad5a,
+	],
+	[
+		0x10949ee0, 0x1e7a292e, 0x06df8b3d, 0x02b2e30b, 0x31f8729e,
+		0x24e35475, 0x30b71878, 0x35edbfb7, 0x0000ea68, 0x0dd048fa,
+		0x21688929, 0x0de823fe, 0x1c53faa9, 0x0ea0c84d, 0x052a592a,
+		0x1fce7870, 0x11325cb2, 0x00002a27,
+	],
+	[
+		0x04c5723f, 0x30d81a50, 0x048306e4, 0x329b11c7, 0x223fb545,
+		0x085347a8, 0x2993e591, 0x1b5aca8e, 0x0000cef6, 0x04af0773,
+		0x28d2eea9, 0x2751eeec, 0x037b4a7f, 0x3b4c1059, 0x08f37674,
+		0x2ae906e1, 0x18a88a6a, 0x00008786,
+	],
+	[
+		0x34bc21d1, 0x0cce474d, 0x15048bf4, 0x1d0bb409, 0x021cda16,
+		0x20de76c3, 0x34c59063, 0x04ede20e, 0x00003ed1, 0x282a3740,
+		0x0be3bbf3, 0x29889dae, 0x03413697, 0x34c68a09, 0x210ebe93,
+		0x0c8a224c, 0x0826b331, 0x00009099,
+	],
+	[
+		0x0624e3c4, 0x140317ba, 0x2f82c99d, 0x260c0a2c, 0x25d55179,
+		0x194dcc83, 0x3d95e462, 0x356f6a05, 0x0000741d, 0x0d4481d3,
+		0x2657fc8b, 0x1ba5ca71, 0x3ae44b0d, 0x07b1548e, 0x0e0d5522,
+		0x05fdc567, 0x2d1aa70e, 0x00000770,
+	],
+	[
+		0x06072c01, 0x23857675, 0x1ead58a9, 0x0b8a12d9, 0x1ee2fc79,
+		0x0177cb61, 0x0495a618, 0x20deb82b, 0x0000177c, 0x2fc7bfd8,
+		0x310eef8b, 0x1fb4df39, 0x3b8530e8, 0x0f4e7226, 0x0246b6d0,
+		0x2a558a24, 0x163353af, 0x000063bb,
+	],
+	[
+		0x24d2920b, 0x1c249dcc, 0x2069c5e5, 0x09ab2f9e, 0x36df3cf1,
+		0x1991fd0c, 0x062b97a7, 0x1e80070e, 0x000054e7, 0x20d0b375,
+		0x2e9f20bd, 0x35090081, 0x1c7a9ddc, 0x22e7c371, 0x087e3016,
+		0x03175421, 0x3c6eca7d, 0x0000f599,
+	],
+	[
+		0x259b9d5f, 0x0d9a318f, 0x23a0ef16, 0x00ebe4b7, 0x088265ae,
+		0x2cde2666, 0x2bae7adf, 0x1371a5c6, 0x0000f045, 0x0d034f36,
+		0x1f967378, 0x1b5fa3f4, 0x0ec8739d, 0x1643e62a, 0x1653947e,
+		0x22d1f4e6, 0x0fb8d64b, 0x0000b5b9,
+	],
+];
+
+// Lookup one of the Gwin[] values, by index. This is constant-time.
+fn lookup_gwin(t: *p256_jacobian, idx: u32) void = {
+	let xy: [18]u32 = [0...];
+	let k: u32 = 0;
+	let u: size = 0;
+
+	for (let k = 0u32; k < 15; k += 1) {
+		let m = -equ32(idx, k + 1);
+		for (let u = 0z; u < 18; u += 1) {
+			xy[u] |= m & gwin[k][u];
+		};
+	};
+	t.x[..] = xy[..9];
+	t.y[..] = xy[9..];
+	t.z[..] = [0...];
+	t.z[0] = 1;
+};
+
+// Multiply the generator by an integer. The integer is assumed non-zero
+// and lower than the curve order.
+fn p256_mulgen(p :*p256_jacobian, x: []u8) void = {
+	// qz is a flag that is initially 1, and remains equal to 1
+	// as long as the point is the point at infinity.
+	//
+	// We use a 4-bit window to handle multiplier bits by groups
+	// of 4. The precomputed window is constant static data, with
+	// points in affine coordinates; we use a constant-time lookup.
+	let q = p256_jacobian { ... };
+	let qz: u32 = 1;
+	let xpos: size = 0;
+
+	for (let i = len(x); i > 0; i -= 1) {
+		let bx = x[xpos]: u32;
+		xpos += 1;
+
+		for (let k = 0z; k < 2; k += 1) {
+			let bits: u32 = 0;
+			let bnz: u32 = 0;
+			let t = p256_jacobian { ... };
+			let u = p256_jacobian { ... };
+
+			p256_double(&q);
+			p256_double(&q);
+			p256_double(&q);
+			p256_double(&q);
+			bits = (bx >> 4) & 0x0f;
+			bnz = nequ32(bits, 0);
+			lookup_gwin(&t, bits);
+			u = q;
+			p256_add_mixed(&u, &t);
+			jaccopy(bnz & qz, &q, &t);
+			jaccopy(bnz & ~qz, &q, &u);
+			qz &= ~bnz;
+			bx <<= 4;
+		};
+	};
+	*p = q;
+};
+
+const P256_G: [_]u8 = [
+	0x04, 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, 0xf8, 0xbc, 0xe6,
+	0xe5, 0x63, 0xa4, 0x40, 0xf2, 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33,
+	0xa0, 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96, 0x4f, 0xe3, 0x42,
+	0xe2, 0xfe, 0x1a, 0x7f, 0x9b, 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e,
+	0x16, 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, 0xcb, 0xb6, 0x40,
+	0x68, 0x37, 0xbf, 0x51, 0xf5
+];
+
+const P256_N: [_]u8 = [
+	0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xbc, 0xe6, 0xfa, 0xad, 0xa7, 0x17, 0x9e, 0x84,
+	0xf3, 0xb9, 0xca, 0xc2, 0xfc, 0x63, 0x25, 0x51
+];
+
+fn api256_mul(g: []u8, x: []u8) u32 = {
+	if (len(g) != 65) {
+		return 0;
+	};
+
+	let p = p256_jacobian { ... };
+	let r = p256_decode(&p, g);
+
+	p256_mul(&p, x);
+	p256_to_affine(&p);
+	p256_encode(g, &p);
+	return r;
+};
+
+fn api256_mulgen(r: []u8, x: []u8) size = {
+	let p = p256_jacobian { ... };
+
+	p256_mulgen(&p, x);
+	p256_to_affine(&p);
+	p256_encode(r[..65], &p);
+	return 65;
+};
+
+fn api256_muladd(a: []u8, b: []u8, x: []u8, y: []u8) u32 ={
+	let p = p256_jacobian { ... };
+	let q = p256_jacobian { ... };
+
+	if (len(a) != 65) {
+		return 0;
+	};
+
+	let r = p256_decode(&p, a);
+	p256_mul(&p, x);
+	if (len(b) == 0) {
+		p256_mulgen(&q, y);
+	} else {
+		r &= p256_decode(&q, b);
+		p256_mul(&q, y);
+	};
+
+	// The final addition may fail in case both points are equal.
+	let t = p256_add(&p, &q);
+	reduce_final_f256(p.z);
+	let z: u32 = 0;
+	for (let i = 0z; i < 9; i += 1) {
+		z |= p.z[i];
+	};
+	z = equ32(z, 0);
+	p256_double(&q);
+
+	// If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
+	// have the following:
+	//
+	//   z = 0, t = 0   return P (normal addition)
+	//   z = 0, t = 1   return P (normal addition)
+	//   z = 1, t = 0   return Q (a 'double' case)
+	//   z = 1, t = 1   report an error (P+Q = 0)
+	jaccopy(z & ~t, &p, &q);
+	p256_to_affine(&p);
+	p256_encode(a, &p);
+	r &= ~(z & t);
+	return r;
+};
+
+fn api256_order() const []u8 = P256_N;
+fn api256_generator() const []u8 = P256_G;
+
+const _p256: curve = curve {
+	pointsz = P256_POINTSZ,
+	order = &api256_order,
+	generator = &api256_generator,
+	mul = &api256_mul,
+	mulgen = &api256_mulgen,
+	muladd = &api256_muladd,
+	keygen = &mask_keygen,
+};
+
+// Size of a [[p256]] point in bytes.
+export def P256_POINTSZ = len(P256_G);
+
+// Size of a [[p256]] scalar in bytes.
+export def P256_SCALARSZ = len(P256_N);
+
+// A [[curve]] implementation of P-256, also known as secp256r1 or prime256v1.
+export const p256: *curve = &_p256;

	hare [hare] The Hare programming language
	git clone https://git.torresjrjr.com/hare.git
	Log \| Files \| Refs \| README \| LICENSE

A	crypto/ec/curves+test.ha	\|	453	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	crypto/ec/genkeys+test.ha	\|	34	++++++++++++++++++++++++++++++++++
A	crypto/ec/p256.ha	\|	1252	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++