hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

p256.ha (35914B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 // Ported from BearSSL
      5 //
      6 // Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
      7 //
      8 // Permission is hereby granted, free of charge, to any person obtaining
      9 // a copy of this software and associated documentation files (the
     10 // "Software"), to deal in the Software without restriction, including
     11 // without limitation the rights to use, copy, modify, merge, publish,
     12 // distribute, sublicense, and/or sell copies of the Software, and to
     13 // permit persons to whom the Software is furnished to do so, subject to
     14 // the following conditions:
     15 //
     16 // The above copyright notice and this permission notice shall be
     17 // included in all copies or substantial portions of the Software.
     18 //
     19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
     20 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
     21 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
     22 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
     23 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
     24 // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
     25 // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
     26 // SOFTWARE.
     27 
     28 use bytes;
     29 use crypto::math::*;
     30 
     31 
     32 // Note from the BearSSL documentation:
     33 //
     34 // The ec_p256_m31 implementation supports P-256 with specialised code,
     35 // including modular reduction routines that leverage the special format of the
     36 // field modulus, and internal split of data as sequences of 30-bit words, which
     37 // helps with carry propagation. ec_p256_m31 also includes fixed point
     38 // optimisations, for the common case of multiplying the conventional generator
     39 // point. These implementations are faster than the generic "i31" code, but with
     40 // a larger code footprint.
     41 //
     42 // Convert an integer from unsigned big-endian encoding to a sequence of 30-bit
     43 // words in little-endian order. The final "partial" word is returned.
     44 fn be8tole30(dest: []u32, src: []u8) u32 = {
     45 	let acc: u32 = 0;
     46 	let acclen: u32 = 0;
     47 	let destpos = 0;
     48 
     49 	for (let i = len(src); i > 0; i -= 1) {
     50 		let b = src[i - 1]: u32;
     51 		if (acclen < 22) {
     52 			acc |= b << acclen;
     53 			acclen += 8;
     54 		} else {
     55 			dest[destpos] = (acc | (b << acclen)) & 0x3FFFFFFF;
     56 			destpos += 1;
     57 			acc = b >> (30 - acclen);
     58 			acclen -= 22;
     59 		};
     60 	};
     61 	return acc;
     62 };
     63 
     64 // Convert an integer (30-bit words, little-endian) to unsigned
     65 // big-endian encoding. The total encoding length is provided; all
     66 // the destination bytes will be filled.
     67 fn le30tobe8(dest: []u8, src: []u32) void = {
     68 	let acc: u32 = 0;
     69 	let acclen: u32 = 0;
     70 	let srcpos: size = 0;
     71 
     72 	for (let i = len(dest); i > 0; i -= 1) {
     73 		if (acclen < 8) {
     74 			let w = src[srcpos];
     75 			srcpos += 1;
     76 
     77 			dest[i - 1] = (acc | (w << acclen)): u8;
     78 			acc = w >> (8 - acclen);
     79 			acclen += 22;
     80 		} else {
     81 			dest[i - 1] = acc: u8;
     82 			acc >>= 8;
     83 			acclen -= 8;
     84 		};
     85 	};
     86 };
     87 
     88 @test fn be8tole30() void = {
     89 	let be8: [6]u8 = [0x11, 0x22, 0xF3, 0x44, 0x55, 0x66];
     90 	let le30result: [2]u32 = [0...];
     91 	let be8result: [6]u8 = [0...];
     92 
     93 	le30result[1] = be8tole30(le30result, be8);
     94 	le30tobe8(be8result, le30result);
     95 
     96 	assert(bytes::equal(be8, be8result));
     97 };
     98 
     99 fn arsh(x: u32, n: u32) u32 = (x: i32 >> n: i32): u32;
    100 fn arshw(x: u64, n: u32) u64 = (x: i64 >> n: i32): u64;
    101 
    102 @test fn arsh() void = assert(arsh(0x80000000u32, 2) == 0xe0000000);
    103 
    104 // Multiply two integers. Source integers are represented as arrays of
    105 // nine 30-bit words, for values up to 2^270-1. Result is encoded over
    106 // 18 words of 30 bits each.
    107 fn mul9(d: []u32, a: []u32, b: []u32) void = {
    108 	// Maximum intermediate result is no more than
    109 	// 10376293531797946367, which fits in 64 bits. Reason:
    110 	//
    111 	//   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
    112 	//   10376293531797946367 < 9663676407 * 2^30
    113 	//
    114 	// Thus, adding together 9 products of 30-bit integers, with
    115 	// a carry of at most 9663676406, yields an integer that fits
    116 	// on 64 bits and generates a carry of at most 9663676406.
    117 	let t: [17]u64 = [0...];
    118 
    119 	t[0] = mulu32(a[0], b[0]);
    120 	t[1] = mulu32(a[0], b[1])
    121 		+ mulu32(a[1], b[0]);
    122 	t[2] = mulu32(a[0], b[2])
    123 		+ mulu32(a[1], b[1])
    124 		+ mulu32(a[2], b[0]);
    125 	t[3] = mulu32(a[0], b[3])
    126 		+ mulu32(a[1], b[2])
    127 		+ mulu32(a[2], b[1])
    128 		+ mulu32(a[3], b[0]);
    129 	t[4] = mulu32(a[0], b[4])
    130 		+ mulu32(a[1], b[3])
    131 		+ mulu32(a[2], b[2])
    132 		+ mulu32(a[3], b[1])
    133 		+ mulu32(a[4], b[0]);
    134 	t[5] = mulu32(a[0], b[5])
    135 		+ mulu32(a[1], b[4])
    136 		+ mulu32(a[2], b[3])
    137 		+ mulu32(a[3], b[2])
    138 		+ mulu32(a[4], b[1])
    139 		+ mulu32(a[5], b[0]);
    140 	t[6] = mulu32(a[0], b[6])
    141 		+ mulu32(a[1], b[5])
    142 		+ mulu32(a[2], b[4])
    143 		+ mulu32(a[3], b[3])
    144 		+ mulu32(a[4], b[2])
    145 		+ mulu32(a[5], b[1])
    146 		+ mulu32(a[6], b[0]);
    147 	t[7] = mulu32(a[0], b[7])
    148 		+ mulu32(a[1], b[6])
    149 		+ mulu32(a[2], b[5])
    150 		+ mulu32(a[3], b[4])
    151 		+ mulu32(a[4], b[3])
    152 		+ mulu32(a[5], b[2])
    153 		+ mulu32(a[6], b[1])
    154 		+ mulu32(a[7], b[0]);
    155 	t[8] = mulu32(a[0], b[8])
    156 		+ mulu32(a[1], b[7])
    157 		+ mulu32(a[2], b[6])
    158 		+ mulu32(a[3], b[5])
    159 		+ mulu32(a[4], b[4])
    160 		+ mulu32(a[5], b[3])
    161 		+ mulu32(a[6], b[2])
    162 		+ mulu32(a[7], b[1])
    163 		+ mulu32(a[8], b[0]);
    164 	t[9] = mulu32(a[1], b[8])
    165 		+ mulu32(a[2], b[7])
    166 		+ mulu32(a[3], b[6])
    167 		+ mulu32(a[4], b[5])
    168 		+ mulu32(a[5], b[4])
    169 		+ mulu32(a[6], b[3])
    170 		+ mulu32(a[7], b[2])
    171 		+ mulu32(a[8], b[1]);
    172 	t[10] = mulu32(a[2], b[8])
    173 		+ mulu32(a[3], b[7])
    174 		+ mulu32(a[4], b[6])
    175 		+ mulu32(a[5], b[5])
    176 		+ mulu32(a[6], b[4])
    177 		+ mulu32(a[7], b[3])
    178 		+ mulu32(a[8], b[2]);
    179 	t[11] = mulu32(a[3], b[8])
    180 		+ mulu32(a[4], b[7])
    181 		+ mulu32(a[5], b[6])
    182 		+ mulu32(a[6], b[5])
    183 		+ mulu32(a[7], b[4])
    184 		+ mulu32(a[8], b[3]);
    185 	t[12] = mulu32(a[4], b[8])
    186 		+ mulu32(a[5], b[7])
    187 		+ mulu32(a[6], b[6])
    188 		+ mulu32(a[7], b[5])
    189 		+ mulu32(a[8], b[4]);
    190 	t[13] = mulu32(a[5], b[8])
    191 		+ mulu32(a[6], b[7])
    192 		+ mulu32(a[7], b[6])
    193 		+ mulu32(a[8], b[5]);
    194 	t[14] = mulu32(a[6], b[8])
    195 		+ mulu32(a[7], b[7])
    196 		+ mulu32(a[8], b[6]);
    197 	t[15] = mulu32(a[7], b[8])
    198 		+ mulu32(a[8], b[7]);
    199 	t[16] = mulu32(a[8], b[8]);
    200 
    201 	// Propagate carries.
    202 	let cc: u64 = 0;
    203 	for (let i = 0z; i < 17; i += 1) {
    204 		let w = t[i] + cc;
    205 		d[i] = w: u32 & 0x3FFFFFFF;
    206 		cc = w >> 30;
    207 	};
    208 	d[17] = cc: u32;
    209 };
    210 
    211 // Square a 270-bit integer, represented as an array of nine 30-bit words.
    212 // Result uses 18 words of 30 bits each.
    213 fn square9(d: []u32, a: []u32) void = {
    214 	let t: [17]u64 = [0...];
    215 
    216 	t[0] = mulu32(a[0], a[0]);
    217 	t[1] = ((mulu32(a[0], a[1])) << 1);
    218 	t[2] = mulu32(a[1], a[1])
    219 		+ ((mulu32(a[0], a[2])) << 1);
    220 	t[3] = ((mulu32(a[0], a[3])
    221 		+ mulu32(a[1], a[2])) << 1);
    222 	t[4] = mulu32(a[2], a[2])
    223 		+ ((mulu32(a[0], a[4])
    224 		+ mulu32(a[1], a[3])) << 1);
    225 	t[5] = ((mulu32(a[0], a[5])
    226 		+ mulu32(a[1], a[4])
    227 		+ mulu32(a[2], a[3])) << 1);
    228 	t[6] = mulu32(a[3], a[3])
    229 		+ ((mulu32(a[0], a[6])
    230 		+ mulu32(a[1], a[5])
    231 		+ mulu32(a[2], a[4])) << 1);
    232 	t[7] = ((mulu32(a[0], a[7])
    233 		+ mulu32(a[1], a[6])
    234 		+ mulu32(a[2], a[5])
    235 		+ mulu32(a[3], a[4])) << 1);
    236 	t[8] = mulu32(a[4], a[4])
    237 		+ ((mulu32(a[0], a[8])
    238 		+ mulu32(a[1], a[7])
    239 		+ mulu32(a[2], a[6])
    240 		+ mulu32(a[3], a[5])) << 1);
    241 	t[9] = ((mulu32(a[1], a[8])
    242 		+ mulu32(a[2], a[7])
    243 		+ mulu32(a[3], a[6])
    244 		+ mulu32(a[4], a[5])) << 1);
    245 	t[10] = mulu32(a[5], a[5])
    246 		+ ((mulu32(a[2], a[8])
    247 		+ mulu32(a[3], a[7])
    248 		+ mulu32(a[4], a[6])) << 1);
    249 	t[11] = ((mulu32(a[3], a[8])
    250 		+ mulu32(a[4], a[7])
    251 		+ mulu32(a[5], a[6])) << 1);
    252 	t[12] = mulu32(a[6], a[6])
    253 		+ ((mulu32(a[4], a[8])
    254 		+ mulu32(a[5], a[7])) << 1);
    255 	t[13] = ((mulu32(a[5], a[8])
    256 		+ mulu32(a[6], a[7])) << 1);
    257 	t[14] = mulu32(a[7], a[7])
    258 		+ ((mulu32(a[6], a[8])) << 1);
    259 	t[15] = ((mulu32(a[7], a[8])) << 1);
    260 	t[16] = mulu32(a[8], a[8]);
    261 
    262 	// Propagate carries.
    263 	let cc: u64 = 0;
    264 	for (let i = 0z; i < 17; i += 1) {
    265 		let w = t[i] + cc;
    266 		d[i] = w: u32 & 0x3FFFFFFF;
    267 		cc = w >> 30;
    268 	};
    269 	d[17] = cc: u32;
    270 };
    271 
    272 // Base field modulus for P-256.
    273 const F256: [_]u32 = [
    274 	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
    275 	0x00001000, 0x3FFFC000, 0x0000FFFF
    276 ];
    277 
    278 // The 'b' curve equation coefficient for P-256.
    279 const P256_B: [_]u32 = [
    280 	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65, 0x2EF555DA,
    281 	0x293E7B3E, 0x0D762A8E, 0x00005AC6
    282 ];
    283 
    284 // Addition in the field. Source operands shall fit on 257 bits; output
    285 // will be lower than twice the modulus.
    286 fn add_f256(d: []u32, a: []u32, b: []u32) void = {
    287 	let w: u32 = 0;
    288 
    289 	let cc: u32 = 0;
    290 	for (let i = 0z; i < 9; i += 1) {
    291 		w = a[i] + b[i] + cc;
    292 		d[i] = w & 0x3FFFFFFF;
    293 		cc = w >> 30;
    294 	};
    295 	w >>= 16;
    296 	d[8] &= 0xFFFF;
    297 	d[3] -= w << 6;
    298 	d[6] -= w << 12;
    299 	d[7] += w << 14;
    300 	cc = w;
    301 	for (let i = 0z; i < 9; i += 1) {
    302 		w = d[i] + cc;
    303 		d[i] = w & 0x3FFFFFFF;
    304 		cc = arsh(w, 30);
    305 	};
    306 };
    307 
    308 // Subtraction in the field. Source operands shall be smaller than twice
    309 // the modulus; the result will fulfil the same property.
    310 fn sub_f256(d: []u32, a: []u32, b: []u32) void = {
    311 	let w: u32 = 0;
    312 	let cc: u32 = 0;
    313 
    314 	// We really compute a - b + 2*p to make sure that the result is
    315 	// positive.
    316 	w = a[0] - b[0] - 0x00002;
    317 	d[0] = w & 0x3FFFFFFF;
    318 	w = a[1] - b[1] + arsh(w, 30);
    319 	d[1] = w & 0x3FFFFFFF;
    320 	w = a[2] - b[2] + arsh(w, 30);
    321 	d[2] = w & 0x3FFFFFFF;
    322 	w = a[3] - b[3] + arsh(w, 30) + 0x00080;
    323 	d[3] = w & 0x3FFFFFFF;
    324 	w = a[4] - b[4] + arsh(w, 30);
    325 	d[4] = w & 0x3FFFFFFF;
    326 	w = a[5] - b[5] + arsh(w, 30);
    327 	d[5] = w & 0x3FFFFFFF;
    328 	w = a[6] - b[6] + arsh(w, 30) + 0x02000;
    329 	d[6] = w & 0x3FFFFFFF;
    330 	w = a[7] - b[7] + arsh(w, 30) - 0x08000;
    331 	d[7] = w & 0x3FFFFFFF;
    332 	w = a[8] - b[8] + arsh(w, 30) + 0x20000;
    333 	d[8] = w & 0xFFFF;
    334 	w >>= 16;
    335 	d[8] &= 0xFFFF;
    336 	d[3] -= w << 6;
    337 	d[6] -= w << 12;
    338 	d[7] += w << 14;
    339 	cc = w;
    340 	for (let i = 0z; i < 9; i += 1) {
    341 		w = d[i] + cc;
    342 		d[i] = w & 0x3FFFFFFF;
    343 		cc = arsh(w, 30);
    344 	};
    345 };
    346 
    347 // Compute a multiplication in F256. Source operands shall be less than
    348 // twice the modulus.
    349 fn mul_f256(d: []u32, a: []u32, b: []u32) void = {
    350 	let t: [18]u32 = [0...];
    351 	let s: [18]u64 = [0...];
    352 	let x: u64 = 0;
    353 
    354 	mul9(t, a, b);
    355 
    356 	// Modular reduction: each high word in added/subtracted where
    357 	// necessary.
    358 	//
    359 	// The modulus is:
    360 	//    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
    361 	// Therefore:
    362 	//    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
    363 	//
    364 	// For a word x at bit offset n (n >= 256), we have:
    365 	//    x*2^n = x*2^(n-32) - x*2^(n-64)
    366 	//	    - x*2^(n - 160) + x*2^(n-256) mod p
    367 	//
    368 	// Thus, we can nullify the high word if we reinject it at some
    369 	// proper emplacements.
    370 	//
    371 	// We use 64-bit intermediate words to allow for carries to
    372 	// accumulate easily, before performing the final propagation.
    373 	for (let i = 0; i < 18; i += 1) {
    374 		s[i] = t[i];
    375 	};
    376 
    377 	for (let i = 17; i >= 9; i -= 1) {
    378 		let y = s[i];
    379 		s[i - 1] += arshw(y, 2);
    380 		s[i - 2] += (y << 28) & 0x3FFFFFFF;
    381 		s[i - 2] -= arshw(y, 4);
    382 		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
    383 		s[i - 5] -= arshw(y, 10);
    384 		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
    385 		s[i - 8] += arshw(y, 16);
    386 		s[i - 9] += (y << 14) & 0x3FFFFFFF;
    387 	};
    388 
    389 	// Carry propagation must be signed. Moreover, we may have overdone
    390 	// it a bit, and obtain a negative result.
    391 	//
    392 	// The loop above ran 9 times; each time, each word was augmented
    393 	// by at most one extra word (in absolute value). Thus, the top
    394 	// word must in fine fit in 39 bits, so the carry below will fit
    395 	// on 9 bits.
    396 	let cc: u64 = 0;
    397 	for (let i = 0z; i < 9; i += 1) {
    398 		x = s[i] + cc;
    399 		d[i] = x: u32 & 0x3FFFFFFF;
    400 		cc = arshw(x, 30);
    401 	};
    402 
    403 	// All nine words fit on 30 bits, but there may be an extra
    404 	// carry for a few bits (at most 9), and that carry may be
    405 	// negative. Moreover, we want the result to fit on 257 bits.
    406 	// The two lines below ensure that the word in d[] has length
    407 	// 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
    408 	// significant length of cc is less than 24 bits, so we will be
    409 	// able to switch to 32-bit operations.
    410 	cc = arshw(x, 16);
    411 	d[8] &= 0xFFFF;
    412 
    413 	// One extra round of reduction, for cc*2^256, which means
    414 	// adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
    415 	// value. If cc is negative, then it may happen (rarely, but
    416 	// not neglectibly so) that the result would be negative. In
    417 	// order to avoid that, if cc is negative, then we add the
    418 	// modulus once. Note that if cc is negative, then propagating
    419 	// that carry must yield a value lower than the modulus, so
    420 	// adding the modulus once will keep the final result under
    421 	// twice the modulus.
    422 	let z = cc: u32;
    423 	d[3] -= z << 6;
    424 	d[6] -= (z << 12) & 0x3FFFFFFF;
    425 	d[7] -= arsh(z, 18);
    426 	d[7] += (z << 14) & 0x3FFFFFFF;
    427 	d[8] += arsh(z, 16);
    428 	let c = z >> 31;
    429 	d[0] -= c;
    430 	d[3] += c << 6;
    431 	d[6] += c << 12;
    432 	d[7] -= c << 14;
    433 	d[8] += c << 16;
    434 
    435 	for (let i = 0z; i < 9; i += 1) {
    436 		let w = d[i] + z;
    437 		d[i] = w & 0x3FFFFFFF;
    438 		z = arsh(w, 30);
    439 	};
    440 };
    441 
    442 
    443 // Compute a square in F256. Source operand shall be less than
    444 // twice the modulus.
    445 fn square_f256(d: []u32, a: []u32) void = {
    446 	let t: [18]u32 = [0...];
    447 	let s: [18]u64 = [0...];
    448 
    449 	square9(t, a);
    450 
    451 	// Modular reduction: each high word in added/subtracted where
    452 	// necessary.
    453 	//
    454 	// The modulus is:
    455 	//    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
    456 	// Therefore:
    457 	//    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
    458 	//
    459 	// For a word x at bit offset n (n >= 256), we have:
    460 	//    x*2^n = x*2^(n-32) - x*2^(n-64)
    461 	//	    - x*2^(n - 160) + x*2^(n-256) mod p
    462 	//
    463 	// Thus, we can nullify the high word if we reinject it at some
    464 	// proper emplacements.
    465 	//
    466 	// We use 64-bit intermediate words to allow for carries to
    467 	// accumulate easily, before performing the final propagation.
    468 	for (let i = 0; i < 18; i += 1) {
    469 		s[i] = t[i];
    470 	};
    471 
    472 	for (let i = 17; i >= 9; i -= 1) {
    473 		let y = s[i];
    474 		s[i - 1] += arshw(y, 2);
    475 		s[i - 2] += (y << 28) & 0x3FFFFFFF;
    476 		s[i - 2] -= arshw(y, 4);
    477 		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
    478 		s[i - 5] -= arshw(y, 10);
    479 		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
    480 		s[i - 8] += arshw(y, 16);
    481 		s[i - 9] += (y << 14) & 0x3FFFFFFF;
    482 	};
    483 
    484 	// Carry propagation must be signed. Moreover, we may have overdone
    485 	// it a bit, and obtain a negative result.
    486 	//
    487 	// The loop above ran 9 times; each time, each word was augmented
    488 	// by at most one extra word (in absolute value). Thus, the top
    489 	// word must in fine fit in 39 bits, so the carry below will fit
    490 	// on 9 bits.
    491 	let cc: u64 = 0;
    492 	let x: u64 = 0;
    493 	for (let i = 0; i < 9; i += 1) {
    494 		x = s[i] + cc;
    495 		d[i] = x: u32 & 0x3FFFFFFF;
    496 		cc = arshw(x, 30);
    497 	};
    498 
    499 	// All nine words fit on 30 bits, but there may be an extra
    500 	// carry for a few bits (at most 9), and that carry may be
    501 	// negative. Moreover, we want the result to fit on 257 bits.
    502 	// The two lines below ensure that the word in d[] has length
    503 	// 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
    504 	// significant length of cc is less than 24 bits, so we will be
    505 	// able to switch to 32-bit operations.
    506 	cc = arshw(x, 16);
    507 	d[8] &= 0xFFFF;
    508 
    509 	// One extra round of reduction, for cc*2^256, which means
    510 	// adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
    511 	// value. If cc is negative, then it may happen (rarely, but
    512 	// not neglectibly so) that the result would be negative. In
    513 	// order to avoid that, if cc is negative, then we add the
    514 	// modulus once. Note that if cc is negative, then propagating
    515 	// that carry must yield a value lower than the modulus, so
    516 	// adding the modulus once will keep the final result under
    517 	// twice the modulus.
    518 	let z = cc: u32;
    519 	d[3] -= z << 6;
    520 	d[6] -= (z << 12) & 0x3FFFFFFF;
    521 	d[7] -= arsh(z, 18);
    522 	d[7] += (z << 14) & 0x3FFFFFFF;
    523 	d[8] += arsh(z, 16);
    524 	let c = z >> 31;
    525 	d[0] -= c;
    526 	d[3] += c << 6;
    527 	d[6] += c << 12;
    528 	d[7] -= c << 14;
    529 	d[8] += c << 16;
    530 
    531 	for (let i = 0z; i < 9; i += 1) {
    532 		let w = d[i] + z;
    533 		d[i] = w & 0x3FFFFFFF;
    534 		z = arsh(w, 30);
    535 	};
    536 };
    537 
    538 // Perform a "final reduction" in field F256 (field for curve P-256).
    539 // The source value must be less than twice the modulus. If the value
    540 // is not lower than the modulus, then the modulus is subtracted and
    541 // this function returns 1; otherwise, it leaves it untouched and it
    542 // returns 0.
    543 fn reduce_final_f256(d: []u32) u32 = {
    544 	let t: [9]u32 = [0...];
    545 
    546 	let cc: u32 = 0;
    547 	for (let i = 0; i < 9; i += 1) {
    548 		let w = d[i] - F256[i] - cc;
    549 		cc = w >> 31;
    550 		t[i] = w & 0x3FFFFFFF;
    551 	};
    552 	cc ^= 1;
    553 	ccopyu32(cc, d, t);
    554 	return cc;
    555 };
    556 
    557 // Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
    558 // are such that:
    559 //   X = x / z^2
    560 //   Y = y / z^3
    561 // For the point at infinity, z = 0.
    562 // Each point thus admits many possible representations.
    563 //
    564 // Coordinates are represented in arrays of 32-bit integers, each holding
    565 // 30 bits of data. Values may also be slightly greater than the modulus,
    566 // but they will always be lower than twice the modulus.
    567 type p256_jacobian = struct {
    568 	x: [9]u32,
    569 	y: [9]u32,
    570 	z: [9]u32,
    571 };
    572 
    573 // Convert a point to affine coordinates:
    574 //  - If the point is the point at infinity, then all three coordinates
    575 //    are set to 0.
    576 //  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
    577 //    coordinates are the 'X' and 'Y' affine coordinates.
    578 // The coordinates are guaranteed to be lower than the modulus.
    579 fn p256_to_affine(p: *p256_jacobian) void = {
    580 	let t1: [9]u32 = [0...];
    581 	let t2: [9]u32 = [0...];
    582 
    583 	// Invert z with a modular exponentiation: the modulus is
    584 	// p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
    585 	// p-2. Exponent bit pattern (from high to low) is:
    586 	//  - 32 bits of value 1
    587 	//  - 31 bits of value 0
    588 	//  - 1 bit of value 1
    589 	//  - 96 bits of value 0
    590 	//  - 94 bits of value 1
    591 	//  - 1 bit of value 0
    592 	//  - 1 bit of value 1
    593 	// Thus, we precompute z^(2^31-1) to speed things up.
    594 	//
    595 	// If z = 0 (point at infinity) then the modular exponentiation
    596 	// will yield 0, which leads to the expected result (all three
    597 	// coordinates set to 0).
    598 
    599 	// A simple square-and-multiply for z^(2^31-1). We could save about
    600 	// two dozen multiplications here with an addition chain, but
    601 	// this would require a bit more code, and extra stack buffers.
    602 	t1[..] = p.z[..];
    603 	for (let i = 0; i < 30; i += 1) {
    604 		square_f256(t1, t1);
    605 		mul_f256(t1, t1, p.z);
    606 	};
    607 
    608 	// Square-and-multiply. Apart from the squarings, we have a few
    609 	// multiplications to set bits to 1; we multiply by the original z
    610 	// for setting 1 bit, and by t1 for setting 31 bits.
    611 	t2[..] = p.z[..];
    612 	for (let i = 1; i < 256; i += 1) {
    613 		square_f256(t2, t2);
    614 		switch (i) {
    615 		case 31, 190, 221, 252 =>
    616 			mul_f256(t2, t2, t1);
    617 		case 63, 253, 255 =>
    618 			mul_f256(t2, t2, p.z);
    619 		case => void;
    620 		};
    621 	};
    622 
    623 	// Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
    624 	mul_f256(t1, t2, t2);
    625 	mul_f256(p.x, t1, p.x);
    626 	mul_f256(t1, t1, t2);
    627 	mul_f256(p.y, t1, p.y);
    628 	reduce_final_f256(p.x);
    629 	reduce_final_f256(p.y);
    630 
    631 	// Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
    632 	// this will set z to 1.
    633 	mul_f256(p.z, p.z, t2);
    634 	reduce_final_f256(p.z);
    635 };
    636 
    637 // Double a point in P-256. This function works for all valid points,
    638 // including the point at infinity.
    639 fn p256_double(q: *p256_jacobian) void = {
    640 	// Doubling formulas are:
    641 	//
    642 	//   s = 4*x*y^2
    643 	//   m = 3*(x + z^2)*(x - z^2)
    644 	//   x' = m^2 - 2*s
    645 	//   y' = m*(s - x') - 8*y^4
    646 	//   z' = 2*y*z
    647 	//
    648 	// These formulas work for all points, including points of order 2
    649 	// and points at infinity:
    650 	//   - If y = 0 then z' = 0. But there is no such point in P-256
    651 	//     anyway.
    652 	//   - If z = 0 then z' = 0.
    653 	let t1: [9]u32 = [0...];
    654 	let t2: [9]u32 = [0...];
    655 	let t3: [9]u32 = [0...];
    656 	let t4: [9]u32 = [0...];
    657 
    658 	// Compute z^2 in t1.
    659 	square_f256(t1, q.z);
    660 
    661 	// Compute x-z^2 in t2 and x+z^2 in t1.
    662 	add_f256(t2, q.x, t1);
    663 	sub_f256(t1, q.x, t1);
    664 
    665 	// Compute 3*(x+z^2)*(x-z^2) in t1.
    666 	mul_f256(t3, t1, t2);
    667 	add_f256(t1, t3, t3);
    668 	add_f256(t1, t3, t1);
    669 
    670 	// Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
    671 	square_f256(t3, q.y);
    672 	add_f256(t3, t3, t3);
    673 	mul_f256(t2, q.x, t3);
    674 	add_f256(t2, t2, t2);
    675 
    676 
    677 	// Compute x' = m^2 - 2*s.
    678 	square_f256(q.x, t1);
    679 	sub_f256(q.x, q.x, t2);
    680 	sub_f256(q.x, q.x, t2);
    681 
    682 	// Compute z' = 2*y*z.
    683 	mul_f256(t4, q.y, q.z);
    684 	add_f256(q.z, t4, t4);
    685 
    686 	// Compute y' = m*(s - x') - 8*y^4. Note that we already have
    687 	// 2*y^2 in t3.
    688 	sub_f256(t2, t2, q.x);
    689 	mul_f256(q.y, t1, t2);
    690 	square_f256(t4, t3);
    691 	add_f256(t4, t4, t4);
    692 	sub_f256(q.y, q.y, t4);
    693 };
    694 
    695 // Add point P2 to point P1.
    696 //
    697 // This function computes the wrong result in the following cases:
    698 //
    699 //   - If P1 == 0 but P2 != 0
    700 //   - If P1 != 0 but P2 == 0
    701 //   - If P1 == P2
    702 //
    703 // In all three cases, P1 is set to the point at infinity.
    704 //
    705 // Returned value is 0 if one of the following occurs:
    706 //
    707 //   - P1 and P2 have the same Y coordinate
    708 //   - P1 == 0 and P2 == 0
    709 //   - The Y coordinate of one of the points is 0 and the other point is
    710 //     the point at infinity.
    711 //
    712 // The third case cannot actually happen with valid points, since a point
    713 // with Y == 0 is a point of order 2, and there is no point of order 2 on
    714 // curve P-256.
    715 //
    716 // Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
    717 // can apply the following:
    718 //
    719 //   - If the result is not the point at infinity, then it is correct.
    720 //   - Otherwise, if the returned value is 1, then this is a case of
    721 //     P1+P2 == 0, so the result is indeed the point at infinity.
    722 //   - Otherwise, P1 == P2, so a "double" operation should have been
    723 //     performed.
    724 fn p256_add(p1: *p256_jacobian, p2: *p256_jacobian) u32 = {
    725 	// Addtions formulas are:
    726 	//
    727 	//   u1 = x1 * z2^2
    728 	//   u2 = x2 * z1^2
    729 	//   s1 = y1 * z2^3
    730 	//   s2 = y2 * z1^3
    731 	//   h = u2 - u1
    732 	//   r = s2 - s1
    733 	//   x3 = r^2 - h^3 - 2 * u1 * h^2
    734 	//   y3 = r * (u1 * h^2 - x3) - s1 * h^3
    735 	//   z3 = h * z1 * z2
    736 	let t1: [9]u32 = [0...];
    737 	let t2: [9]u32 = [0...];
    738 	let t3: [9]u32 = [0...];
    739 	let t4: [9]u32 = [0...];
    740 	let t5: [9]u32 = [0...];
    741 	let t6: [9]u32 = [0...];
    742 	let t7: [9]u32 = [0...];
    743 	let ret: u32 = 0;
    744 
    745 	// Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
    746 	square_f256(t3, p2.z);
    747 	mul_f256(t1, p1.x, t3);
    748 	mul_f256(t4, p2.z, t3);
    749 	mul_f256(t3, p1.y, t4);
    750 
    751 	// Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
    752 	square_f256(t4, p1.z);
    753 	mul_f256(t2, p2.x, t4);
    754 	mul_f256(t5, p1.z, t4);
    755 	mul_f256(t4, p2.y, t5);
    756 
    757 	// Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
    758 	// We need to test whether r is zero, so we will do some extra
    759 	// reduce.
    760 	sub_f256(t2, t2, t1);
    761 	sub_f256(t4, t4, t3);
    762 	reduce_final_f256(t4);
    763 	ret = 0;
    764 	for (let i = 0; i < 9; i += 1) {
    765 		ret |= t4[i];
    766 	};
    767 	ret = (ret | -ret) >> 31;
    768 
    769 	// Compute u1*h^2 (in t6) and h^3 (in t5);
    770 	square_f256(t7, t2);
    771 	mul_f256(t6, t1, t7);
    772 	mul_f256(t5, t7, t2);
    773 
    774 	// Compute x3 = r^2 - h^3 - 2*u1*h^2.
    775 	square_f256(p1.x, t4);
    776 	sub_f256(p1.x, p1.x, t5);
    777 	sub_f256(p1.x, p1.x, t6);
    778 	sub_f256(p1.x, p1.x, t6);
    779 
    780 	// Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
    781 	sub_f256(t6, t6, p1.x);
    782 	mul_f256(p1.y, t4, t6);
    783 	mul_f256(t1, t5, t3);
    784 	sub_f256(p1.y, p1.y, t1);
    785 
    786 	// Compute z3 = h*z1*z2.
    787 	mul_f256(t1, p1.z, p2.z);
    788 	mul_f256(p1.z, t1, t2);
    789 
    790 	return ret;
    791 };
    792 
    793 // Add point P2 to point P1. This is a specialised function for the
    794 // case when P2 is a non-zero point in affine coordinate.
    795 //
    796 // This function computes the wrong result in the following cases:
    797 //
    798 //   - If P1 == 0
    799 //   - If P1 == P2
    800 //
    801 // In both cases, P1 is set to the point at infinity.
    802 //
    803 // Returned value is 0 if one of the following occurs:
    804 //
    805 //   - P1 and P2 have the same Y coordinate
    806 //   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
    807 //
    808 // The second case cannot actually happen with valid points, since a point
    809 // with Y == 0 is a point of order 2, and there is no point of order 2 on
    810 // curve P-256.
    811 //
    812 // Therefore, assuming that P1 != 0 on input, then the caller
    813 // can apply the following:
    814 //
    815 //   - If the result is not the point at infinity, then it is correct.
    816 //   - Otherwise, if the returned value is 1, then this is a case of
    817 //     P1+P2 == 0, so the result is indeed the point at infinity.
    818 //   - Otherwise, P1 == P2, so a "double" operation should have been
    819 //     performed.
    820 fn p256_add_mixed(p1: *p256_jacobian, p2: *p256_jacobian) u32 = {
    821 	// Addtions formulas are:
    822 	//
    823 	//   u1 = x1
    824 	//   u2 = x2 * z1^2
    825 	//   s1 = y1
    826 	//   s2 = y2 * z1^3
    827 	//   h = u2 - u1
    828 	//   r = s2 - s1
    829 	//   x3 = r^2 - h^3 - 2 * u1 * h^2
    830 	//   y3 = r * (u1 * h^2 - x3) - s1 * h^3
    831 	//   z3 = h * z1
    832 	let t1: [9]u32 = [0...];
    833 	let t2: [9]u32 = [0...];
    834 	let t3: [9]u32 = [0...];
    835 	let t4: [9]u32 = [0...];
    836 	let t5: [9]u32 = [0...];
    837 	let t6: [9]u32 = [0...];
    838 	let t7: [9]u32 = [0...];
    839 	let ret: u32 = 0;
    840 
    841 	// Compute u1 = x1 (in t1) and s1 = y1 (in t3).
    842 	t1[..] = p1.x[..];
    843 	t3[..] = p1.y[..];
    844 
    845 	// Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
    846 	square_f256(t4, p1.z);
    847 	mul_f256(t2, p2.x, t4);
    848 	mul_f256(t5, p1.z, t4);
    849 	mul_f256(t4, p2.y, t5);
    850 
    851 	// Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
    852 	// We need to test whether r is zero, so we will do some extra
    853 	// reduce.
    854 	sub_f256(t2, t2, t1);
    855 	sub_f256(t4, t4, t3);
    856 	reduce_final_f256(t4);
    857 	ret = 0;
    858 	for (let i = 0; i < 9; i += 1) {
    859 		ret |= t4[i];
    860 	};
    861 	ret = (ret | -ret) >> 31;
    862 
    863 	// Compute u1*h^2 (in t6) and h^3 (in t5);
    864 	square_f256(t7, t2);
    865 	mul_f256(t6, t1, t7);
    866 	mul_f256(t5, t7, t2);
    867 
    868 	// Compute x3 = r^2 - h^3 - 2*u1*h^2.
    869 	square_f256(p1.x, t4);
    870 	sub_f256(p1.x, p1.x, t5);
    871 	sub_f256(p1.x, p1.x, t6);
    872 	sub_f256(p1.x, p1.x, t6);
    873 
    874 	// Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
    875 	sub_f256(t6, t6, p1.x);
    876 	mul_f256(p1.y, t4, t6);
    877 	mul_f256(t1, t5, t3);
    878 	sub_f256(p1.y, p1.y, t1);
    879 
    880 	// Compute z3 = h*z1*z2.
    881 	mul_f256(p1.z, p1.z, t2);
    882 
    883 	return ret;
    884 };
    885 
    886 // Decode a P-256 point. This function does not support the point at
    887 // infinity. Returned value is 0 if the point is invalid, 1 otherwise.
    888 fn p256_decode(p: *p256_jacobian, src: const []u8) u32 = {
    889 	let tx: [9]u32 = [0...];
    890 	let ty: [9]u32 = [0...];
    891 	let t1: [9]u32 = [0...];
    892 	let t2: [9]u32 = [0...];
    893 	let bad: u32 = 0;
    894 
    895 	if (len(src) != 65) {
    896 		return 0;
    897 	};
    898 	let buf = src;
    899 
    900 	// First byte must be 0x04 (uncompressed format). We could support
    901 	// "hybrid format" (first byte is 0x06 or 0x07, and encodes the
    902 	// least significant bit of the Y coordinate), but it is explicitly
    903 	// forbidden by RFC 5480 (section 2.2).
    904 	bad = nequ32(buf[0], 0x04);
    905 
    906 	// Decode the coordinates, and check that they are both lower
    907 	// than the modulus.
    908 	tx[8] = be8tole30(tx, buf[1..33]);
    909 	ty[8] = be8tole30(ty, buf[33..]);
    910 	bad |= reduce_final_f256(tx);
    911 	bad |= reduce_final_f256(ty);
    912 
    913 	// Check curve equation.
    914 	square_f256(t1, tx);
    915 	mul_f256(t1, tx, t1);
    916 	square_f256(t2, ty);
    917 	sub_f256(t1, t1, tx);
    918 	sub_f256(t1, t1, tx);
    919 	sub_f256(t1, t1, tx);
    920 	add_f256(t1, t1, P256_B);
    921 	sub_f256(t1, t1, t2);
    922 	reduce_final_f256(t1);
    923 	for (let i = 0; i < 9; i += 1) {
    924 		bad |= t1[i];
    925 	};
    926 
    927 	// Copy coordinates to the point structure.
    928 	p.x[..] = tx[..];
    929 	p.y[..] = ty[..];
    930 	p.z[..] = [0...];
    931 	p.z[0] = 1;
    932 	return equ32(bad, 0);
    933 };
    934 
    935 // Encode a point into a buffer. This function assumes that the point is
    936 // valid, in affine coordinates, and not the point at infinity.
    937 fn p256_encode(dest: []u8, p: *p256_jacobian) void = {
    938 	dest[0] = 0x04;
    939 	le30tobe8(dest[1..33], p.x);
    940 	le30tobe8(dest[33..], p.y);
    941 };
    942 
    943 // Multiply a curve point by an integer. The integer is assumed to be
    944 // lower than the curve order, and the base point must not be the point
    945 // at infinity.
    946 fn p256_mul(p: *p256_jacobian, x: []u8) void = {
    947 	// qz is a flag that is initially 1, and remains equal to 1
    948 	// as long as the point is the point at infinity.
    949 	//
    950 	// We use a 2-bit window to handle multiplier bits by pairs.
    951 	// The precomputed window really is the points P2 and P3.
    952 	let qz: u32 = 1;
    953 	let p2 = p256_jacobian { ... };
    954 	let p3 = p256_jacobian { ... };
    955 	let q = p256_jacobian { ... };
    956 	let t = p256_jacobian { ... };
    957 	let u = p256_jacobian { ... };
    958 	let xpos: size = 0;
    959 
    960 	// Compute window values.
    961 	p2 = *p;
    962 	p256_double(&p2);
    963 	p3 = *p;
    964 	p256_add(&p3, &p2);
    965 
    966 	// We start with Q = 0. We process multiplier bits 2 by 2.
    967 	for (let i = len(x); i > 0; i -= 1) {
    968 		for (let k = 6i8; k >= 0; k -= 2) {
    969 			let bits: u32 = 0;
    970 			let bnz: u32 = 0;
    971 
    972 			p256_double(&q);
    973 			p256_double(&q);
    974 			t = *p;
    975 			u = q;
    976 			bits = (x[xpos] >> k: u8) & 3u32;
    977 			bnz = nequ32(bits, 0);
    978 			jaccopy(equ32(bits, 2), &t, &p2);
    979 			jaccopy(equ32(bits, 3), &t, &p3);
    980 			p256_add(&u, &t);
    981 			jaccopy(bnz & qz, &q, &t);
    982 			jaccopy(bnz & ~qz, &q, &u);
    983 			qz &= ~bnz;
    984 		};
    985 		xpos += 1;
    986 	};
    987 	*p = q;
    988 };
    989 
    990 fn jaccopy(ctl: u32, dest: *p256_jacobian, src: *p256_jacobian) void = {
    991 	ccopyu32(ctl, dest.x, src.x);
    992 	ccopyu32(ctl, dest.y, src.y);
    993 	ccopyu32(ctl, dest.z, src.z);
    994 };
    995 
    996 // Precomputed window: k*G points, where G is the curve generator, and k
    997 // is an integer from 1 to 15 (inclusive). The X and Y coordinates of
    998 // the point are encoded as 9 words of 30 bits each (little-endian
    999 // order).
   1000 const gwin: [15][18]u32 = [
   1001 	[
   1002 		0x1898c296, 0x1284e517, 0x1eb33a0f, 0x00df604b, 0x2440f277,
   1003 		0x339b958e, 0x04247f8b, 0x347cb84b, 0x00006b17, 0x37bf51f5,
   1004 		0x2ed901a0, 0x3315ecec, 0x338cd5da, 0x0f9e162b, 0x1fad29f0,
   1005 		0x27f9b8ee, 0x10b8bf86, 0x00004fe3,
   1006 	],
   1007 	[
   1008 		0x07669978, 0x182d23f1, 0x3f21b35a, 0x225a789d, 0x351ac3c0,
   1009 		0x08e00c12, 0x34f7e8a5, 0x1ec62340, 0x00007cf2, 0x227873d1,
   1010 		0x3812de74, 0x0e982299, 0x1f6b798f, 0x3430dbba, 0x366b1a7d,
   1011 		0x2d040293, 0x154436e3, 0x00000777,
   1012 	],
   1013 	[
   1014 		0x06e7fd6c, 0x2d05986f, 0x3ada985f, 0x31adc87b, 0x0bf165e6,
   1015 		0x1fbe5475, 0x30a44c8f, 0x3934698c, 0x00005ecb, 0x227d5032,
   1016 		0x29e6c49e, 0x04fb83d9, 0x0aac0d8e, 0x24a2ecd8, 0x2c1b3869,
   1017 		0x0ff7e374, 0x19031266, 0x00008734,
   1018 	],
   1019 	[
   1020 		0x2b030852, 0x024c0911, 0x05596ef5, 0x07f8b6de, 0x262bd003,
   1021 		0x3779967b, 0x08fbba02, 0x128d4cb4, 0x0000e253, 0x184ed8c6,
   1022 		0x310b08fc, 0x30ee0055, 0x3f25b0fc, 0x062d764e, 0x3fb97f6a,
   1023 		0x33cc719d, 0x15d69318, 0x0000e0f1,
   1024 	],
   1025 	[
   1026 		0x03d033ed, 0x05552837, 0x35be5242, 0x2320bf47, 0x268fdfef,
   1027 		0x13215821, 0x140d2d78, 0x02de9454, 0x00005159, 0x3da16da4,
   1028 		0x0742ed13, 0x0d80888d, 0x004bc035, 0x0a79260d, 0x06fcdafe,
   1029 		0x2727d8ae, 0x1f6a2412, 0x0000e0c1,
   1030 	],
   1031 	[
   1032 		0x3c2291a9, 0x1ac2aba4, 0x3b215b4c, 0x131d037a, 0x17dde302,
   1033 		0x0c90b2e2, 0x0602c92d, 0x05ca9da9, 0x0000b01a, 0x0fc77fe2,
   1034 		0x35f1214e, 0x07e16bdf, 0x003ddc07, 0x2703791c, 0x3038b7ee,
   1035 		0x3dad56fe, 0x041d0c8d, 0x0000e85c,
   1036 	],
   1037 	[
   1038 		0x3187b2a3, 0x0018a1c0, 0x00fef5b3, 0x3e7e2e2a, 0x01fb607e,
   1039 		0x2cc199f0, 0x37b4625b, 0x0edbe82f, 0x00008e53, 0x01f400b4,
   1040 		0x15786a1b, 0x3041b21c, 0x31cd8cf2, 0x35900053, 0x1a7e0e9b,
   1041 		0x318366d0, 0x076f780c, 0x000073eb,
   1042 	],
   1043 	[
   1044 		0x1b6fb393, 0x13767707, 0x3ce97dbb, 0x348e2603, 0x354cadc1,
   1045 		0x09d0b4ea, 0x1b053404, 0x1de76fba, 0x000062d9, 0x0f09957e,
   1046 		0x295029a8, 0x3e76a78d, 0x3b547dae, 0x27cee0a2, 0x0575dc45,
   1047 		0x1d8244ff, 0x332f647a, 0x0000ad5a,
   1048 	],
   1049 	[
   1050 		0x10949ee0, 0x1e7a292e, 0x06df8b3d, 0x02b2e30b, 0x31f8729e,
   1051 		0x24e35475, 0x30b71878, 0x35edbfb7, 0x0000ea68, 0x0dd048fa,
   1052 		0x21688929, 0x0de823fe, 0x1c53faa9, 0x0ea0c84d, 0x052a592a,
   1053 		0x1fce7870, 0x11325cb2, 0x00002a27,
   1054 	],
   1055 	[
   1056 		0x04c5723f, 0x30d81a50, 0x048306e4, 0x329b11c7, 0x223fb545,
   1057 		0x085347a8, 0x2993e591, 0x1b5aca8e, 0x0000cef6, 0x04af0773,
   1058 		0x28d2eea9, 0x2751eeec, 0x037b4a7f, 0x3b4c1059, 0x08f37674,
   1059 		0x2ae906e1, 0x18a88a6a, 0x00008786,
   1060 	],
   1061 	[
   1062 		0x34bc21d1, 0x0cce474d, 0x15048bf4, 0x1d0bb409, 0x021cda16,
   1063 		0x20de76c3, 0x34c59063, 0x04ede20e, 0x00003ed1, 0x282a3740,
   1064 		0x0be3bbf3, 0x29889dae, 0x03413697, 0x34c68a09, 0x210ebe93,
   1065 		0x0c8a224c, 0x0826b331, 0x00009099,
   1066 	],
   1067 	[
   1068 		0x0624e3c4, 0x140317ba, 0x2f82c99d, 0x260c0a2c, 0x25d55179,
   1069 		0x194dcc83, 0x3d95e462, 0x356f6a05, 0x0000741d, 0x0d4481d3,
   1070 		0x2657fc8b, 0x1ba5ca71, 0x3ae44b0d, 0x07b1548e, 0x0e0d5522,
   1071 		0x05fdc567, 0x2d1aa70e, 0x00000770,
   1072 	],
   1073 	[
   1074 		0x06072c01, 0x23857675, 0x1ead58a9, 0x0b8a12d9, 0x1ee2fc79,
   1075 		0x0177cb61, 0x0495a618, 0x20deb82b, 0x0000177c, 0x2fc7bfd8,
   1076 		0x310eef8b, 0x1fb4df39, 0x3b8530e8, 0x0f4e7226, 0x0246b6d0,
   1077 		0x2a558a24, 0x163353af, 0x000063bb,
   1078 	],
   1079 	[
   1080 		0x24d2920b, 0x1c249dcc, 0x2069c5e5, 0x09ab2f9e, 0x36df3cf1,
   1081 		0x1991fd0c, 0x062b97a7, 0x1e80070e, 0x000054e7, 0x20d0b375,
   1082 		0x2e9f20bd, 0x35090081, 0x1c7a9ddc, 0x22e7c371, 0x087e3016,
   1083 		0x03175421, 0x3c6eca7d, 0x0000f599,
   1084 	],
   1085 	[
   1086 		0x259b9d5f, 0x0d9a318f, 0x23a0ef16, 0x00ebe4b7, 0x088265ae,
   1087 		0x2cde2666, 0x2bae7adf, 0x1371a5c6, 0x0000f045, 0x0d034f36,
   1088 		0x1f967378, 0x1b5fa3f4, 0x0ec8739d, 0x1643e62a, 0x1653947e,
   1089 		0x22d1f4e6, 0x0fb8d64b, 0x0000b5b9,
   1090 	],
   1091 ];
   1092 
   1093 // Lookup one of the Gwin[] values, by index. This is constant-time.
   1094 fn lookup_gwin(t: *p256_jacobian, idx: u32) void = {
   1095 	let xy: [18]u32 = [0...];
   1096 	let k: u32 = 0;
   1097 	let u: size = 0;
   1098 
   1099 	for (let k = 0u32; k < 15; k += 1) {
   1100 		let m = -equ32(idx, k + 1);
   1101 		for (let u = 0z; u < 18; u += 1) {
   1102 			xy[u] |= m & gwin[k][u];
   1103 		};
   1104 	};
   1105 	t.x[..] = xy[..9];
   1106 	t.y[..] = xy[9..];
   1107 	t.z[..] = [0...];
   1108 	t.z[0] = 1;
   1109 };
   1110 
   1111 // Multiply the generator by an integer. The integer is assumed non-zero
   1112 // and lower than the curve order.
   1113 fn p256_mulgen(p :*p256_jacobian, x: []u8) void = {
   1114 	// qz is a flag that is initially 1, and remains equal to 1
   1115 	// as long as the point is the point at infinity.
   1116 	//
   1117 	// We use a 4-bit window to handle multiplier bits by groups
   1118 	// of 4. The precomputed window is constant static data, with
   1119 	// points in affine coordinates; we use a constant-time lookup.
   1120 	let q = p256_jacobian { ... };
   1121 	let qz: u32 = 1;
   1122 	let xpos: size = 0;
   1123 
   1124 	for (let i = len(x); i > 0; i -= 1) {
   1125 		let bx = x[xpos]: u32;
   1126 		xpos += 1;
   1127 
   1128 		for (let k = 0z; k < 2; k += 1) {
   1129 			let bits: u32 = 0;
   1130 			let bnz: u32 = 0;
   1131 			let t = p256_jacobian { ... };
   1132 			let u = p256_jacobian { ... };
   1133 
   1134 			p256_double(&q);
   1135 			p256_double(&q);
   1136 			p256_double(&q);
   1137 			p256_double(&q);
   1138 			bits = (bx >> 4) & 0x0f;
   1139 			bnz = nequ32(bits, 0);
   1140 			lookup_gwin(&t, bits);
   1141 			u = q;
   1142 			p256_add_mixed(&u, &t);
   1143 			jaccopy(bnz & qz, &q, &t);
   1144 			jaccopy(bnz & ~qz, &q, &u);
   1145 			qz &= ~bnz;
   1146 			bx <<= 4;
   1147 		};
   1148 	};
   1149 	*p = q;
   1150 };
   1151 
   1152 const P256_G: [_]u8 = [
   1153 	0x04, 0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47, 0xf8, 0xbc, 0xe6,
   1154 	0xe5, 0x63, 0xa4, 0x40, 0xf2, 0x77, 0x03, 0x7d, 0x81, 0x2d, 0xeb, 0x33,
   1155 	0xa0, 0xf4, 0xa1, 0x39, 0x45, 0xd8, 0x98, 0xc2, 0x96, 0x4f, 0xe3, 0x42,
   1156 	0xe2, 0xfe, 0x1a, 0x7f, 0x9b, 0x8e, 0xe7, 0xeb, 0x4a, 0x7c, 0x0f, 0x9e,
   1157 	0x16, 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, 0xcb, 0xb6, 0x40,
   1158 	0x68, 0x37, 0xbf, 0x51, 0xf5
   1159 ];
   1160 
   1161 const P256_N: [_]u8 = [
   1162 	0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
   1163 	0xff, 0xff, 0xff, 0xff, 0xbc, 0xe6, 0xfa, 0xad, 0xa7, 0x17, 0x9e, 0x84,
   1164 	0xf3, 0xb9, 0xca, 0xc2, 0xfc, 0x63, 0x25, 0x51
   1165 ];
   1166 
   1167 fn api256_mul(g: []u8, x: []u8) u32 = {
   1168 	if (len(g) != 65) {
   1169 		return 0;
   1170 	};
   1171 
   1172 	let p = p256_jacobian { ... };
   1173 	let r = p256_decode(&p, g);
   1174 
   1175 	p256_mul(&p, x);
   1176 	p256_to_affine(&p);
   1177 	p256_encode(g, &p);
   1178 	return r;
   1179 };
   1180 
   1181 fn api256_mulgen(r: []u8, x: []u8) size = {
   1182 	let p = p256_jacobian { ... };
   1183 
   1184 	p256_mulgen(&p, x);
   1185 	p256_to_affine(&p);
   1186 	p256_encode(r[..65], &p);
   1187 	return 65;
   1188 };
   1189 
   1190 fn api256_muladd(a: []u8, b: []u8, x: []u8, y: []u8) u32 ={
   1191 	let p = p256_jacobian { ... };
   1192 	let q = p256_jacobian { ... };
   1193 
   1194 	if (len(a) != 65) {
   1195 		return 0;
   1196 	};
   1197 
   1198 	let r = p256_decode(&p, a);
   1199 	p256_mul(&p, x);
   1200 	if (len(b) == 0) {
   1201 		p256_mulgen(&q, y);
   1202 	} else {
   1203 		r &= p256_decode(&q, b);
   1204 		p256_mul(&q, y);
   1205 	};
   1206 
   1207 	// The final addition may fail in case both points are equal.
   1208 	let t = p256_add(&p, &q);
   1209 	reduce_final_f256(p.z);
   1210 	let z: u32 = 0;
   1211 	for (let i = 0z; i < 9; i += 1) {
   1212 		z |= p.z[i];
   1213 	};
   1214 	z = equ32(z, 0);
   1215 	p256_double(&q);
   1216 
   1217 	// If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
   1218 	// have the following:
   1219 	//
   1220 	//   z = 0, t = 0   return P (normal addition)
   1221 	//   z = 0, t = 1   return P (normal addition)
   1222 	//   z = 1, t = 0   return Q (a 'double' case)
   1223 	//   z = 1, t = 1   report an error (P+Q = 0)
   1224 	jaccopy(z & ~t, &p, &q);
   1225 	p256_to_affine(&p);
   1226 	p256_encode(a, &p);
   1227 	r &= ~(z & t);
   1228 	return r;
   1229 };
   1230 
   1231 fn api256_order() const []u8 = P256_N;
   1232 fn api256_generator() const []u8 = P256_G;
   1233 
   1234 const _p256: curve = curve {
   1235 	pointsz = P256_POINTSZ,
   1236 	order = &api256_order,
   1237 	generator = &api256_generator,
   1238 	mul = &api256_mul,
   1239 	mulgen = &api256_mulgen,
   1240 	muladd = &api256_muladd,
   1241 	keygen = &mask_keygen,
   1242 };
   1243 
   1244 // Size of a [[p256]] point in bytes.
   1245 export def P256_POINTSZ = len(P256_G);
   1246 
   1247 // Size of a [[p256]] scalar in bytes.
   1248 export def P256_SCALARSZ = len(P256_N);
   1249 
   1250 // A [[curve]] implementation of P-256, also known as secp256r1 or prime256v1.
   1251 export const p256: *curve = &_p256;