hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

ftos_ryu.ha (14668B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 // Using Ryƫ: fast float-to-string conversion algorithm by Ulf Adams.
      5 // https://doi.org/10.1145/3192366.3192369
      6 // This Hare implementation is translated from the original
      7 // C implementation here: https://github.com/ulfjack/ryu
      8 
      9 use ascii;
     10 use math;
     11 use types;
     12 
     13 type r128 = struct {
     14 	hi: u64,
     15 	lo: u64,
     16 };
     17 
     18 // TODO: use 128-bit integers when implemented
     19 fn u128mul(a: u64, b: u64) r128 = {
     20 	const a0 = a: u32: u64, a1 = a >> 32;
     21 	const b0 = b: u32: u64, b1 = b >> 32;
     22 	const p00 = a0 * b0, p01 = a0 * b1, p10 = a1 * b0, p11 = a1 * b1;
     23 	const p00_lo = p00: u32: u64, p00_hi = p00 >> 32;
     24 	const mid1 = p10 + p00_hi;
     25 	const mid1_lo = mid1: u32: u64, mid1_hi = mid1 >> 32;
     26 	const mid2 = p01 + mid1_lo;
     27 	const mid2_lo = mid2: u32: u64, mid2_hi = mid2 >> 32;
     28 	const r_hi = p11 + mid1_hi + mid2_hi;
     29 	const r_lo = (mid2_lo << 32) | p00_lo;
     30 	return r128 { hi = r_hi, lo = r_lo };
     31 };
     32 
     33 // TODO: Same as above
     34 fn u128rshift(lo: u64, hi: u64, s: u32) u64 = {
     35 	assert(0 <= s);
     36 	assert(s <= 64);
     37 	return (hi << (64 - s)) | (lo >> s);
     38 };
     39 
     40 fn pow5fac(value: u64) u32 = {
     41 	const m_inv_5: u64 = 14757395258967641293; // 5 * m_inv_5 = 1 (mod 2^64)
     42 	const n_div_5: u64 = 3689348814741910323;
     43 	let count: u32 = 0;
     44 	for (true) {
     45 		assert(value != 0);
     46 		value *= m_inv_5;
     47 		if (value > n_div_5) break;
     48 		count += 1;
     49 	};
     50 	return count;
     51 };
     52 
     53 fn pow5fac32(value: u32) u32 = {
     54 	let count: u32 = 0;
     55 	for (true) {
     56 		assert(value != 0);
     57 		const q = value / 5, r = value % 5;
     58 		if (r != 0) break;
     59 		value = q;
     60 		count += 1;
     61 	};
     62 	return count;
     63 };
     64 
     65 fn ibool(b: bool) u8 = if (b) 1 else 0;
     66 
     67 fn pow5multiple(v: u64, p: u32) bool = pow5fac(v) >= p;
     68 fn pow5multiple32(v: u32, p: u32) bool = pow5fac32(v) >= p;
     69 
     70 fn pow2multiple(v: u64, p: u32) bool = {
     71 	assert(v > 0);
     72 	assert(p < 64);
     73 	return (v & ((1u64 << p) - 1)) == 0;
     74 };
     75 
     76 fn pow2multiple32(v: u32, p: u32) bool = {
     77 	assert(v > 0);
     78 	assert(p < 32);
     79 	return (v & ((1u32 << p) - 1)) == 0;
     80 };
     81 
     82 fn mulshift64(m: u64, mul: (u64, u64), j: u32) u64 = {
     83 	// m is maximum 55 bits
     84 	let r0 = u128mul(m, mul.0), r1 = u128mul(m, mul.1);
     85 	const sum = r1.lo + r0.hi;
     86 	r1.hi += ibool(sum < r0.hi);
     87 	return u128rshift(sum, r1.hi, j - 64);
     88 };
     89 
     90 fn mulshiftall64(
     91 	m: u64,
     92 	mul: (u64, u64),
     93 	j: i32,
     94 	mm_shift: u32,
     95 ) (u64, u64, u64) = {
     96 	m <<= 1;
     97 	const r0 = u128mul(m, mul.0), r1 = u128mul(m, mul.1);
     98 	const lo = r0.lo, tmp = r0.hi, mid = tmp + r1.lo;
     99 	const hi = r1.hi + ibool(mid < tmp);
    100 	const lo2 = lo + mul.0;
    101 	const mid2 = mid + mul.1 + ibool(lo2 < lo);
    102 	const hi2 = hi + ibool(mid2 < mid);
    103 	const v_plus = u128rshift(mid2, hi2, (j - 64 - 1): u32);
    104 	const v_minus = if (mm_shift == 1) {
    105 		const lo3 = lo - mul.0;
    106 		const mid3 = mid - mul.1 - ibool(lo3 > lo);
    107 		const hi3 = hi - ibool(mid3 > mid);
    108 		yield u128rshift(mid3, hi3, (j - 64 - 1): u32);
    109 	} else {
    110 		const lo3 = lo + lo;
    111 		const mid3 = mid + mid + ibool(lo3 < lo);
    112 		const hi3 = hi + hi + ibool(mid3 < mid);
    113 		const lo4 = lo3 - mul.0;
    114 		const mid4 = mid3 - mul.1 - ibool(lo4 > lo3);
    115 		const hi4 = hi3 - ibool(mid4 > mid3);
    116 		yield u128rshift(mid4, hi4, (j - 64): u32);
    117 	};
    118 	const v_rounded = u128rshift(mid, hi, (j - 64 - 1): u32);
    119 	return (v_plus, v_rounded, v_minus);
    120 };
    121 
    122 fn mulshift32(m: u32, a: u64, s: u32) u32 = {
    123 	assert(s > 32);
    124 	const a_lo = a: u32: u64, a_hi = a >> 32;
    125 	const b0 = m * a_lo, b1 = m * a_hi;
    126 	const sum = (b0 >> 32) + b1, ss = sum >> (s - 32);
    127 	assert(ss <= types::U32_MAX);
    128 	return ss: u32;
    129 };
    130 
    131 fn mulpow5inv_divpow2(m: u32, q: u32, j: i32) u32 = {
    132 	const pow5 = f64computeinvpow5(q);
    133 	return mulshift32(m, pow5.1 + 1, j: u32);
    134 };
    135 
    136 fn mulpow5_divpow2(m: u32, i: u32, j: i32) u32 = {
    137 	const pow5 = f64computepow5(i);
    138 	return mulshift32(m, pow5.1, j: u32);
    139 };
    140 
    141 fn log2pow5(e: u32) u32 = {
    142 	assert(e <= 3528);
    143 	return ((e * 1217359) >> 19);
    144 };
    145 
    146 fn ceil_log2pow5(e: u32) u32 = log2pow5(e) + 1;
    147 
    148 fn pow5bits(e: u32) u32 = ceil_log2pow5(e);
    149 
    150 fn log10pow2(e: u32) u32 = {
    151 	assert(e <= 1650);
    152 	return ((e * 78913) >> 18);
    153 };
    154 
    155 fn log10pow5(e: u32) u32 = {
    156 	assert(e <= 2620);
    157 	return ((e * 732923) >> 20);
    158 };
    159 
    160 def F64_POW5_INV_BITCOUNT: u8 = 125;
    161 def F64_POW5_BITCOUNT: u8 = 125;
    162 
    163 def F32_POW5_INV_BITCOUNT: u8 = F64_POW5_INV_BITCOUNT - 64;
    164 def F32_POW5_BITCOUNT: u8 = F64_POW5_BITCOUNT - 64;
    165 
    166 const F64_POW5_INV_SPLIT2: [15][2]u64 = [
    167 	[1, 2305843009213693952],
    168 	[5955668970331000884, 1784059615882449851],
    169 	[8982663654677661702, 1380349269358112757],
    170 	[7286864317269821294, 2135987035920910082],
    171 	[7005857020398200553, 1652639921975621497],
    172 	[17965325103354776697, 1278668206209430417],
    173 	[8928596168509315048, 1978643211784836272],
    174 	[10075671573058298858, 1530901034580419511],
    175 	[597001226353042382, 1184477304306571148],
    176 	[1527430471115325346, 1832889850782397517],
    177 	[12533209867169019542, 1418129833677084982],
    178 	[5577825024675947042, 2194449627517475473],
    179 	[11006974540203867551, 1697873161311732311],
    180 	[10313493231639821582, 1313665730009899186],
    181 	[12701016819766672773, 2032799256770390445],
    182 ];
    183 
    184 const POW5_INV_OFFSETS: [19]u32 = [
    185 	0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
    186 	0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
    187 	0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
    188 	0x00000000
    189 ];
    190 
    191 const F64_POW5_SPLIT2: [13][2]u64 = [
    192 	[0, 1152921504606846976],
    193 	[0, 1490116119384765625],
    194 	[1032610780636961552, 1925929944387235853],
    195 	[7910200175544436838, 1244603055572228341],
    196 	[16941905809032713930, 1608611746708759036],
    197 	[13024893955298202172, 2079081953128979843],
    198 	[6607496772837067824, 1343575221513417750],
    199 	[17332926989895652603, 1736530273035216783],
    200 	[13037379183483547984, 2244412773384604712],
    201 	[1605989338741628675, 1450417759929778918],
    202 	[9630225068416591280, 1874621017369538693],
    203 	[665883850346957067, 1211445438634777304],
    204 	[14931890668723713708, 1565756531257009982]
    205 ];
    206 
    207 const POW5_OFFSETS: [21]u32 = [
    208 	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
    209 	0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
    210 	0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
    211 	0x55559155, 0x51405555, 0x00000105
    212 ];
    213 
    214 def POW5_TABLE_SZ: u8 = 26;
    215 
    216 const POW5_TABLE: [POW5_TABLE_SZ]u64 = [
    217 	1u64, 5u64, 25u64, 125u64, 625u64, 3125u64, 15625u64, 78125u64,
    218 	390625u64, 1953125u64, 9765625u64, 48828125u64, 244140625u64,
    219 	1220703125u64, 6103515625u64, 30517578125u64, 152587890625u64,
    220 	762939453125u64, 3814697265625u64, 19073486328125u64, 95367431640625u64,
    221 	476837158203125u64, 2384185791015625u64, 11920928955078125u64,
    222 	59604644775390625u64, 298023223876953125u64 //, 1490116119384765625u64
    223 ];
    224 
    225 fn f64computeinvpow5(i: u32) (u64, u64) = {
    226 	const base = ((i + POW5_TABLE_SZ - 1) / POW5_TABLE_SZ): u32;
    227 	const base2 = base * POW5_TABLE_SZ;
    228 	const mul = F64_POW5_INV_SPLIT2[base];
    229 	const off = base2 - i;
    230 	if (off == 0) {
    231 		return (mul[0], mul[1]);
    232 	};
    233 	const m = POW5_TABLE[off];
    234 	const r1 = u128mul(m, mul[1]), r0 = u128mul(m, mul[0] - 1);
    235 	let high1 = r1.hi, low1 = r1.lo, high0 = r0.hi, low0 = r0.lo;
    236 	const sum = high0 + low1;
    237 	if (sum < high0) {
    238 		high1 += 1;
    239 	};
    240 	const delta = pow5bits(base2) - pow5bits(i);
    241 	const res0 = u128rshift(low0, sum, delta) + 1 +
    242 		((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
    243 	const res1 = u128rshift(sum, high1, delta);
    244 	return (res0, res1);
    245 };
    246 
    247 fn f64computepow5(i: u32) (u64, u64) = {
    248 	const base = i / POW5_TABLE_SZ, base2 = base * POW5_TABLE_SZ;
    249 	const mul = F64_POW5_SPLIT2[base];
    250 	const off = i - base2;
    251 	if (off == 0) {
    252 		return (mul[0], mul[1]);
    253 	};
    254 	const m = POW5_TABLE[off];
    255 	const r1 = u128mul(m, mul[1]), r0 = u128mul(m, mul[0]);
    256 	let high1 = r1.hi, low1 = r1.lo, high0 = r0.hi, low0 = r0.lo;
    257 	const sum = high0 + low1;
    258 	if (sum < high0) {
    259 		high1 += 1;
    260 	};
    261 	const delta = pow5bits(i) - pow5bits(base2);
    262 	const res0 = u128rshift(low0, sum, delta) +
    263 		((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
    264 	const res1 = u128rshift(sum, high1, delta);
    265 	return (res0, res1);
    266 };
    267 
    268 type decf64 = struct {
    269 	mantissa: u64,
    270 	exponent: i32,
    271 };
    272 
    273 fn f64todecf64(mantissa: u64, exponent: u32) decf64 = {
    274 	let e2 = (math::F64_EXPONENT_BIAS + math::F64_MANTISSA_BITS + 2): i32;
    275 	let m2: u64 = 0;
    276 	if (exponent == 0) {
    277 		e2 = 1 - e2;
    278 		m2 = mantissa;
    279 	} else {
    280 		e2 = (exponent: i32) - e2;
    281 		m2 = (1u64 << math::F64_MANTISSA_BITS) | mantissa;
    282 	};
    283 	const accept_bounds = (m2 & 1) == 0;
    284 	const mv = 4 * m2;
    285 	const mm_shift = ibool(mantissa != 0 || exponent <= 1);
    286 	let vp: u64 = 0, vr: u64 = 0, vm: u64 = 0;
    287 	let e10: i32 = 0;
    288 	let vm_trailing_zeros = false, vr_trailing_zeros = false;
    289 	if (e2 >= 0) {
    290 		const q = log10pow2(e2: u32) - ibool(e2 > 3);
    291 		e10 = q: i32;
    292 		const k = F64_POW5_INV_BITCOUNT + pow5bits(q) - 1;
    293 		const i = -e2 + (q + k): i32;
    294 		let pow5 = f64computeinvpow5(q);
    295 		const res = mulshiftall64(m2, pow5, i, mm_shift);
    296 		vp = res.0; vr = res.1; vm = res.2;
    297 		if (q <= 21) {
    298 			if ((mv - 5 * (mv / 5)) == 0) {
    299 				vr_trailing_zeros = pow5multiple(mv, q);
    300 			} else if (accept_bounds) {
    301 				vm_trailing_zeros = pow5multiple(mv - 1 -
    302 					mm_shift, q);
    303 			} else {
    304 				vp -= ibool(pow5multiple(mv + 2, q));
    305 			};
    306 		};
    307 	} else {
    308 		const q = log10pow5((-e2): u32) - ibool(-e2 > 1);
    309 		e10 = e2 + (q: i32);
    310 		const i = -e2 - (q: i32);
    311 		const k = pow5bits(i: u32): i32 - F64_POW5_BITCOUNT: i32;
    312 		const j = (q: i32) - k;
    313 		let pow5 = f64computepow5(i: u32);
    314 		const res = mulshiftall64(m2, pow5, j, mm_shift);
    315 		vp = res.0; vr = res.1; vm = res.2;
    316 		if (q <= 1) {
    317 			vr_trailing_zeros = true;
    318 			if (accept_bounds) {
    319 				vm_trailing_zeros = mm_shift == 1;
    320 			} else {
    321 				vp -= 1;
    322 			};
    323 		} else if (q < 63) {
    324 			vr_trailing_zeros = pow2multiple(mv, q);
    325 		};
    326 	};
    327 	let removed: i32 = 0, last_removed_digit: u8 = 0;
    328 	let output: u64 = 0;
    329 	if (vm_trailing_zeros || vr_trailing_zeros) {
    330 		for (true) {
    331 			const vpby10 = vp / 10, vmby10 = vm / 10;
    332 			if (vpby10 <= vmby10) break;
    333 			const vmmod10 = (vm: u32) - 10 * (vmby10: u32);
    334 			const vrby10 = vr / 10;
    335 			const vrmod10 = (vr: u32) - 10 * (vrby10: u32);
    336 			vm_trailing_zeros &&= vmmod10 == 0;
    337 			vr_trailing_zeros &&= last_removed_digit == 0;
    338 			last_removed_digit = vrmod10: u8;
    339 			vr = vrby10; vp = vpby10; vm = vmby10;
    340 			removed += 1;
    341 		};
    342 		if (vm_trailing_zeros) {
    343 			for (true) {
    344 				const vmby10 = vm / 10;
    345 				const vmmod10 = (vm: u32) - 10 * (vmby10: u32);
    346 				if (vmmod10 != 0) break;
    347 				const vpby10 = vp / 10, vrby10 = vr / 10;
    348 				const vrmod10 = (vr: u32) - 10 * (vrby10: u32);
    349 				vr_trailing_zeros &&= last_removed_digit == 0;
    350 				last_removed_digit = vrmod10: u8;
    351 				vr = vrby10; vp = vpby10; vm = vmby10;
    352 				removed += 1;
    353 			};
    354 		};
    355 		if (vr_trailing_zeros && last_removed_digit == 5 &&
    356 				(vr & 1 == 0)) {
    357 			// round to even
    358 			last_removed_digit = 4;
    359 		};
    360 		output = vr + ibool((vr == vm &&
    361 			(!accept_bounds || !vm_trailing_zeros)) ||
    362 			last_removed_digit >= 5);
    363 	} else {
    364 		let round_up = false;
    365 		const vpby100 = vp / 100, vmby100 = vm / 100;
    366 		if (vpby100 > vmby100) {
    367 			const vrby100 = vr / 100;
    368 			const vrmod100 = (vr: u32) - 100 * (vrby100: u32);
    369 			round_up = vrmod100 >= 50;
    370 			vr = vrby100; vp = vpby100; vm = vmby100;
    371 			removed += 2;
    372 		};
    373 		for (true) {
    374 			const vmby10 = vm / 10, vpby10 = vp / 10;
    375 			if (vpby10 <= vmby10) break;
    376 			const vrby10 = vr / 10;
    377 			const vrmod10 = (vr: u32) - 10 * (vrby10: u32);
    378 			round_up = vrmod10 >= 5;
    379 			vr = vrby10; vp = vpby10; vm = vmby10;
    380 			removed += 1;
    381 		};
    382 		output = vr + ibool(vr == vm || round_up);
    383 	};
    384 	const exp = e10 + removed;
    385 	return decf64 { exponent = exp, mantissa = output };
    386 };
    387 
    388 type decf32 = struct {
    389 	mantissa: u32,
    390 	exponent: i32,
    391 };
    392 
    393 fn f32todecf32(mantissa: u32, exponent: u32) decf32 = {
    394 	let e2 = (math::F32_EXPONENT_BIAS + math::F32_MANTISSA_BITS + 2): i32;
    395 	let m2: u32 = 0;
    396 	if (exponent == 0) {
    397 		e2 = 1 - e2;
    398 		m2 = mantissa;
    399 	} else {
    400 		e2 = (exponent: i32) - e2;
    401 		m2 = (1u32 << math::F32_MANTISSA_BITS: u32) | mantissa;
    402 	};
    403 	const accept_bounds = (m2 & 1) == 0;
    404 	const mv = 4 * m2, mp = mv + 2;
    405 	const mm_shift = ibool(mantissa != 0 || exponent <= 1);
    406 	const mm = mv - 1 - mm_shift;
    407 	let vr: u32 = 0, vp: u32 = 0, vm: u32 = 0;
    408 	let e10: i32 = 0;
    409 	let vm_trailing_zeroes = false, vr_trailing_zeroes = false;
    410 	let last_removed_digit: u8 = 0;
    411 	if (e2 >= 0) {
    412 		const q = log10pow2(e2: u32);
    413 		e10 = q: i32;
    414 		const k = F32_POW5_INV_BITCOUNT + pow5bits(q) - 1;
    415 		const i = -e2 + (q + k): i32;
    416 		vr = mulpow5inv_divpow2(mv, q, i);
    417 		vp = mulpow5inv_divpow2(mp, q, i);
    418 		vm = mulpow5inv_divpow2(mm, q, i);
    419 		if (q != 0 && (vp - 1) / 10 <= vm / 10) {
    420 			const l = F32_POW5_INV_BITCOUNT + pow5bits(q - 1) - 1;
    421 			last_removed_digit = (mulpow5inv_divpow2(mv, q - 1,
    422 				-e2 + ((q + l): i32) - 1) % 10): u8;
    423 		};
    424 		if (q <= 9) {
    425 			if (mv % 5 == 0) {
    426 				vr_trailing_zeroes = pow5multiple32(mv, q);
    427 			} else if (accept_bounds) {
    428 				vm_trailing_zeroes = pow5multiple32(mm, q);
    429 			} else {
    430 				vp -= ibool(pow5multiple32(mp, q));
    431 			};
    432 		};
    433 	} else {
    434 		const q = log10pow5((-e2): u32);
    435 		e10 = (q: i32) + e2;
    436 		const i = (-e2 - (q: i32)): u32;
    437 		const k = pow5bits(i) - F32_POW5_BITCOUNT;
    438 		let j = (q: i32) - k: i32;
    439 		vr = mulpow5_divpow2(mv, i, j);
    440 		vp = mulpow5_divpow2(mp, i, j);
    441 		vm = mulpow5_divpow2(mm, i, j);
    442 		if (q != 0 && (vp - 1) / 10 <= vm / 10) {
    443 			j = (q: i32) - 1 - (pow5bits(i + 1): i32 -
    444 				F32_POW5_BITCOUNT: i32);
    445 			last_removed_digit = (mulpow5_divpow2(mv,
    446 				(i + 1), j) % 10): u8;
    447 		};
    448 		if (q <= 1) {
    449 			vr_trailing_zeroes = true;
    450 			if (accept_bounds) {
    451 				vm_trailing_zeroes = mm_shift == 1;
    452 			} else {
    453 				vp -= 1;
    454 			};
    455 		} else if (q < 31) {
    456 			vr_trailing_zeroes = pow2multiple32(mv, q - 1);
    457 		};
    458 	};
    459 	let removed: i32 = 0, output: u32 = 0;
    460 	if (vm_trailing_zeroes || vr_trailing_zeroes) {
    461 		for (vp / 10 > vm / 10) {
    462 			vm_trailing_zeroes &&= (vm - (vm / 10) * 10) == 0;
    463 			vr_trailing_zeroes &&= last_removed_digit == 0;
    464 			last_removed_digit = (vr % 10): u8;
    465 			vr /= 10;
    466 			vp /= 10;
    467 			vm /= 10;
    468 			removed += 1;
    469 		};
    470 		if (vm_trailing_zeroes) {
    471 			for (vm % 10 == 0) {
    472 				vr_trailing_zeroes &&= last_removed_digit == 0;
    473 				last_removed_digit = (vr % 10): u8;
    474 				vr /= 10;
    475 				vp /= 10;
    476 				vm /= 10;
    477 				removed += 1;
    478 			};
    479 		};
    480 		if (vr_trailing_zeroes && last_removed_digit == 5 &&
    481 				vr % 2 == 0) {
    482 			// round to even
    483 			last_removed_digit = 4;
    484 		};
    485 		output = vr + ibool((vr == vm &&
    486 			(!accept_bounds || !vm_trailing_zeroes)) ||
    487 			last_removed_digit >= 5);
    488 	} else {
    489 		for (vp / 10 > vm / 10) {
    490 			last_removed_digit = (vr % 10): u8;
    491 			vr /= 10;
    492 			vp /= 10;
    493 			vm /= 10;
    494 			removed += 1;
    495 		};
    496 		output = vr + ibool(vr == vm || last_removed_digit >= 5);
    497 	};
    498 	const exp = e10 + removed;
    499 	return decf32 { mantissa = output, exponent = exp };
    500 };
    501 
    502 def F32_DECIMAL_DIGITS: i32 = 9;
    503 def F64_DECIMAL_DIGITS: i32 = 17;