hare

[hare] The Hare programming language
git clone https://git.torresjrjr.com/hare.git
Log | Files | Refs | README | LICENSE

ftos_ryu.ha (14657B)


      1 // SPDX-License-Identifier: MPL-2.0
      2 // (c) Hare authors <https://harelang.org>
      3 
      4 // Using Ryƫ: fast float-to-string conversion algorithm by Ulf Adams.
      5 // https://doi.org/10.1145/3192366.3192369
      6 // This Hare implementation is translated from the original
      7 // C implementation here: https://github.com/ulfjack/ryu
      8 
      9 use math;
     10 use types;
     11 
     12 type r128 = struct {
     13 	hi: u64,
     14 	lo: u64,
     15 };
     16 
     17 // TODO: use 128-bit integers when implemented
     18 fn u128mul(a: u64, b: u64) r128 = {
     19 	const a0 = a: u32: u64, a1 = a >> 32;
     20 	const b0 = b: u32: u64, b1 = b >> 32;
     21 	const p00 = a0 * b0, p01 = a0 * b1, p10 = a1 * b0, p11 = a1 * b1;
     22 	const p00_lo = p00: u32: u64, p00_hi = p00 >> 32;
     23 	const mid1 = p10 + p00_hi;
     24 	const mid1_lo = mid1: u32: u64, mid1_hi = mid1 >> 32;
     25 	const mid2 = p01 + mid1_lo;
     26 	const mid2_lo = mid2: u32: u64, mid2_hi = mid2 >> 32;
     27 	const r_hi = p11 + mid1_hi + mid2_hi;
     28 	const r_lo = (mid2_lo << 32) | p00_lo;
     29 	return r128 { hi = r_hi, lo = r_lo };
     30 };
     31 
     32 // TODO: Same as above
     33 fn u128rshift(lo: u64, hi: u64, s: u32) u64 = {
     34 	assert(0 <= s);
     35 	assert(s <= 64);
     36 	return (hi << (64 - s)) | (lo >> s);
     37 };
     38 
     39 fn pow5fac(value: u64) u32 = {
     40 	const m_inv_5: u64 = 14757395258967641293; // 5 * m_inv_5 = 1 (mod 2^64)
     41 	const n_div_5: u64 = 3689348814741910323;
     42 	let count: u32 = 0;
     43 	for (true) {
     44 		assert(value != 0);
     45 		value *= m_inv_5;
     46 		if (value > n_div_5) break;
     47 		count += 1;
     48 	};
     49 	return count;
     50 };
     51 
     52 fn pow5fac32(value: u32) u32 = {
     53 	let count: u32 = 0;
     54 	for (true) {
     55 		assert(value != 0);
     56 		const q = value / 5, r = value % 5;
     57 		if (r != 0) break;
     58 		value = q;
     59 		count += 1;
     60 	};
     61 	return count;
     62 };
     63 
     64 fn ibool(b: bool) u8 = if (b) 1 else 0;
     65 
     66 fn pow5multiple(v: u64, p: u32) bool = pow5fac(v) >= p;
     67 fn pow5multiple32(v: u32, p: u32) bool = pow5fac32(v) >= p;
     68 
     69 fn pow2multiple(v: u64, p: u32) bool = {
     70 	assert(v > 0);
     71 	assert(p < 64);
     72 	return (v & ((1u64 << p) - 1)) == 0;
     73 };
     74 
     75 fn pow2multiple32(v: u32, p: u32) bool = {
     76 	assert(v > 0);
     77 	assert(p < 32);
     78 	return (v & ((1u32 << p) - 1)) == 0;
     79 };
     80 
     81 fn mulshift64(m: u64, mul: (u64, u64), j: u32) u64 = {
     82 	// m is maximum 55 bits
     83 	let r0 = u128mul(m, mul.0), r1 = u128mul(m, mul.1);
     84 	const sum = r1.lo + r0.hi;
     85 	r1.hi += ibool(sum < r0.hi);
     86 	return u128rshift(sum, r1.hi, j - 64);
     87 };
     88 
     89 fn mulshiftall64(
     90 	m: u64,
     91 	mul: (u64, u64),
     92 	j: i32,
     93 	mm_shift: u32,
     94 ) (u64, u64, u64) = {
     95 	m <<= 1;
     96 	const r0 = u128mul(m, mul.0), r1 = u128mul(m, mul.1);
     97 	const lo = r0.lo, tmp = r0.hi, mid = tmp + r1.lo;
     98 	const hi = r1.hi + ibool(mid < tmp);
     99 	const lo2 = lo + mul.0;
    100 	const mid2 = mid + mul.1 + ibool(lo2 < lo);
    101 	const hi2 = hi + ibool(mid2 < mid);
    102 	const v_plus = u128rshift(mid2, hi2, (j - 64 - 1): u32);
    103 	const v_minus = if (mm_shift == 1) {
    104 		const lo3 = lo - mul.0;
    105 		const mid3 = mid - mul.1 - ibool(lo3 > lo);
    106 		const hi3 = hi - ibool(mid3 > mid);
    107 		yield u128rshift(mid3, hi3, (j - 64 - 1): u32);
    108 	} else {
    109 		const lo3 = lo + lo;
    110 		const mid3 = mid + mid + ibool(lo3 < lo);
    111 		const hi3 = hi + hi + ibool(mid3 < mid);
    112 		const lo4 = lo3 - mul.0;
    113 		const mid4 = mid3 - mul.1 - ibool(lo4 > lo3);
    114 		const hi4 = hi3 - ibool(mid4 > mid3);
    115 		yield u128rshift(mid4, hi4, (j - 64): u32);
    116 	};
    117 	const v_rounded = u128rshift(mid, hi, (j - 64 - 1): u32);
    118 	return (v_plus, v_rounded, v_minus);
    119 };
    120 
    121 fn mulshift32(m: u32, a: u64, s: u32) u32 = {
    122 	assert(s > 32);
    123 	const a_lo = a: u32: u64, a_hi = a >> 32;
    124 	const b0 = m * a_lo, b1 = m * a_hi;
    125 	const sum = (b0 >> 32) + b1, ss = sum >> (s - 32);
    126 	assert(ss <= types::U32_MAX);
    127 	return ss: u32;
    128 };
    129 
    130 fn mulpow5inv_divpow2(m: u32, q: u32, j: i32) u32 = {
    131 	const pow5 = f64computeinvpow5(q);
    132 	return mulshift32(m, pow5.1 + 1, j: u32);
    133 };
    134 
    135 fn mulpow5_divpow2(m: u32, i: u32, j: i32) u32 = {
    136 	const pow5 = f64computepow5(i);
    137 	return mulshift32(m, pow5.1, j: u32);
    138 };
    139 
    140 fn log2pow5(e: u32) u32 = {
    141 	assert(e <= 3528);
    142 	return ((e * 1217359) >> 19);
    143 };
    144 
    145 fn ceil_log2pow5(e: u32) u32 = log2pow5(e) + 1;
    146 
    147 fn pow5bits(e: u32) u32 = ceil_log2pow5(e);
    148 
    149 fn log10pow2(e: u32) u32 = {
    150 	assert(e <= 1650);
    151 	return ((e * 78913) >> 18);
    152 };
    153 
    154 fn log10pow5(e: u32) u32 = {
    155 	assert(e <= 2620);
    156 	return ((e * 732923) >> 20);
    157 };
    158 
    159 def F64_POW5_INV_BITCOUNT: u8 = 125;
    160 def F64_POW5_BITCOUNT: u8 = 125;
    161 
    162 def F32_POW5_INV_BITCOUNT: u8 = F64_POW5_INV_BITCOUNT - 64;
    163 def F32_POW5_BITCOUNT: u8 = F64_POW5_BITCOUNT - 64;
    164 
    165 const F64_POW5_INV_SPLIT2: [15][2]u64 = [
    166 	[1, 2305843009213693952],
    167 	[5955668970331000884, 1784059615882449851],
    168 	[8982663654677661702, 1380349269358112757],
    169 	[7286864317269821294, 2135987035920910082],
    170 	[7005857020398200553, 1652639921975621497],
    171 	[17965325103354776697, 1278668206209430417],
    172 	[8928596168509315048, 1978643211784836272],
    173 	[10075671573058298858, 1530901034580419511],
    174 	[597001226353042382, 1184477304306571148],
    175 	[1527430471115325346, 1832889850782397517],
    176 	[12533209867169019542, 1418129833677084982],
    177 	[5577825024675947042, 2194449627517475473],
    178 	[11006974540203867551, 1697873161311732311],
    179 	[10313493231639821582, 1313665730009899186],
    180 	[12701016819766672773, 2032799256770390445],
    181 ];
    182 
    183 const POW5_INV_OFFSETS: [19]u32 = [
    184 	0x54544554, 0x04055545, 0x10041000, 0x00400414, 0x40010000, 0x41155555,
    185 	0x00000454, 0x00010044, 0x40000000, 0x44000041, 0x50454450, 0x55550054,
    186 	0x51655554, 0x40004000, 0x01000001, 0x00010500, 0x51515411, 0x05555554,
    187 	0x00000000
    188 ];
    189 
    190 const F64_POW5_SPLIT2: [13][2]u64 = [
    191 	[0, 1152921504606846976],
    192 	[0, 1490116119384765625],
    193 	[1032610780636961552, 1925929944387235853],
    194 	[7910200175544436838, 1244603055572228341],
    195 	[16941905809032713930, 1608611746708759036],
    196 	[13024893955298202172, 2079081953128979843],
    197 	[6607496772837067824, 1343575221513417750],
    198 	[17332926989895652603, 1736530273035216783],
    199 	[13037379183483547984, 2244412773384604712],
    200 	[1605989338741628675, 1450417759929778918],
    201 	[9630225068416591280, 1874621017369538693],
    202 	[665883850346957067, 1211445438634777304],
    203 	[14931890668723713708, 1565756531257009982]
    204 ];
    205 
    206 const POW5_OFFSETS: [21]u32 = [
    207 	0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x40000000, 0x59695995,
    208 	0x55545555, 0x56555515, 0x41150504, 0x40555410, 0x44555145, 0x44504540,
    209 	0x45555550, 0x40004000, 0x96440440, 0x55565565, 0x54454045, 0x40154151,
    210 	0x55559155, 0x51405555, 0x00000105
    211 ];
    212 
    213 def POW5_TABLE_SZ: u8 = 26;
    214 
    215 const POW5_TABLE: [POW5_TABLE_SZ]u64 = [
    216 	1u64, 5u64, 25u64, 125u64, 625u64, 3125u64, 15625u64, 78125u64,
    217 	390625u64, 1953125u64, 9765625u64, 48828125u64, 244140625u64,
    218 	1220703125u64, 6103515625u64, 30517578125u64, 152587890625u64,
    219 	762939453125u64, 3814697265625u64, 19073486328125u64, 95367431640625u64,
    220 	476837158203125u64, 2384185791015625u64, 11920928955078125u64,
    221 	59604644775390625u64, 298023223876953125u64 //, 1490116119384765625u64
    222 ];
    223 
    224 fn f64computeinvpow5(i: u32) (u64, u64) = {
    225 	const base = ((i + POW5_TABLE_SZ - 1) / POW5_TABLE_SZ): u32;
    226 	const base2 = base * POW5_TABLE_SZ;
    227 	const mul = F64_POW5_INV_SPLIT2[base];
    228 	const off = base2 - i;
    229 	if (off == 0) {
    230 		return (mul[0], mul[1]);
    231 	};
    232 	const m = POW5_TABLE[off];
    233 	const r1 = u128mul(m, mul[1]), r0 = u128mul(m, mul[0] - 1);
    234 	let high1 = r1.hi, low1 = r1.lo, high0 = r0.hi, low0 = r0.lo;
    235 	const sum = high0 + low1;
    236 	if (sum < high0) {
    237 		high1 += 1;
    238 	};
    239 	const delta = pow5bits(base2) - pow5bits(i);
    240 	const res0 = u128rshift(low0, sum, delta) + 1 +
    241 		((POW5_INV_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
    242 	const res1 = u128rshift(sum, high1, delta);
    243 	return (res0, res1);
    244 };
    245 
    246 fn f64computepow5(i: u32) (u64, u64) = {
    247 	const base = i / POW5_TABLE_SZ, base2 = base * POW5_TABLE_SZ;
    248 	const mul = F64_POW5_SPLIT2[base];
    249 	const off = i - base2;
    250 	if (off == 0) {
    251 		return (mul[0], mul[1]);
    252 	};
    253 	const m = POW5_TABLE[off];
    254 	const r1 = u128mul(m, mul[1]), r0 = u128mul(m, mul[0]);
    255 	let high1 = r1.hi, low1 = r1.lo, high0 = r0.hi, low0 = r0.lo;
    256 	const sum = high0 + low1;
    257 	if (sum < high0) {
    258 		high1 += 1;
    259 	};
    260 	const delta = pow5bits(i) - pow5bits(base2);
    261 	const res0 = u128rshift(low0, sum, delta) +
    262 		((POW5_OFFSETS[i / 16] >> ((i % 16) << 1)) & 3);
    263 	const res1 = u128rshift(sum, high1, delta);
    264 	return (res0, res1);
    265 };
    266 
    267 type decf64 = struct {
    268 	mantissa: u64,
    269 	exponent: i32,
    270 };
    271 
    272 fn f64todecf64(mantissa: u64, exponent: u32) decf64 = {
    273 	let e2 = (math::F64_EXPONENT_BIAS + math::F64_MANTISSA_BITS + 2): i32;
    274 	let m2: u64 = 0;
    275 	if (exponent == 0) {
    276 		e2 = 1 - e2;
    277 		m2 = mantissa;
    278 	} else {
    279 		e2 = (exponent: i32) - e2;
    280 		m2 = (1u64 << math::F64_MANTISSA_BITS) | mantissa;
    281 	};
    282 	const accept_bounds = (m2 & 1) == 0;
    283 	const mv = 4 * m2;
    284 	const mm_shift = ibool(mantissa != 0 || exponent <= 1);
    285 	let vp: u64 = 0, vr: u64 = 0, vm: u64 = 0;
    286 	let e10: i32 = 0;
    287 	let vm_trailing_zeros = false, vr_trailing_zeros = false;
    288 	if (e2 >= 0) {
    289 		const q = log10pow2(e2: u32) - ibool(e2 > 3);
    290 		e10 = q: i32;
    291 		const k = F64_POW5_INV_BITCOUNT + pow5bits(q) - 1;
    292 		const i = -e2 + (q + k): i32;
    293 		let pow5 = f64computeinvpow5(q);
    294 		const res = mulshiftall64(m2, pow5, i, mm_shift);
    295 		vp = res.0; vr = res.1; vm = res.2;
    296 		if (q <= 21) {
    297 			if ((mv - 5 * (mv / 5)) == 0) {
    298 				vr_trailing_zeros = pow5multiple(mv, q);
    299 			} else if (accept_bounds) {
    300 				vm_trailing_zeros = pow5multiple(mv - 1 -
    301 					mm_shift, q);
    302 			} else {
    303 				vp -= ibool(pow5multiple(mv + 2, q));
    304 			};
    305 		};
    306 	} else {
    307 		const q = log10pow5((-e2): u32) - ibool(-e2 > 1);
    308 		e10 = e2 + (q: i32);
    309 		const i = -e2 - (q: i32);
    310 		const k = pow5bits(i: u32): i32 - F64_POW5_BITCOUNT: i32;
    311 		const j = (q: i32) - k;
    312 		let pow5 = f64computepow5(i: u32);
    313 		const res = mulshiftall64(m2, pow5, j, mm_shift);
    314 		vp = res.0; vr = res.1; vm = res.2;
    315 		if (q <= 1) {
    316 			vr_trailing_zeros = true;
    317 			if (accept_bounds) {
    318 				vm_trailing_zeros = mm_shift == 1;
    319 			} else {
    320 				vp -= 1;
    321 			};
    322 		} else if (q < 63) {
    323 			vr_trailing_zeros = pow2multiple(mv, q);
    324 		};
    325 	};
    326 	let removed: i32 = 0, last_removed_digit: u8 = 0;
    327 	let output: u64 = 0;
    328 	if (vm_trailing_zeros || vr_trailing_zeros) {
    329 		for (true) {
    330 			const vpby10 = vp / 10, vmby10 = vm / 10;
    331 			if (vpby10 <= vmby10) break;
    332 			const vmmod10 = (vm: u32) - 10 * (vmby10: u32);
    333 			const vrby10 = vr / 10;
    334 			const vrmod10 = (vr: u32) - 10 * (vrby10: u32);
    335 			vm_trailing_zeros &&= vmmod10 == 0;
    336 			vr_trailing_zeros &&= last_removed_digit == 0;
    337 			last_removed_digit = vrmod10: u8;
    338 			vr = vrby10; vp = vpby10; vm = vmby10;
    339 			removed += 1;
    340 		};
    341 		if (vm_trailing_zeros) {
    342 			for (true) {
    343 				const vmby10 = vm / 10;
    344 				const vmmod10 = (vm: u32) - 10 * (vmby10: u32);
    345 				if (vmmod10 != 0) break;
    346 				const vpby10 = vp / 10, vrby10 = vr / 10;
    347 				const vrmod10 = (vr: u32) - 10 * (vrby10: u32);
    348 				vr_trailing_zeros &&= last_removed_digit == 0;
    349 				last_removed_digit = vrmod10: u8;
    350 				vr = vrby10; vp = vpby10; vm = vmby10;
    351 				removed += 1;
    352 			};
    353 		};
    354 		if (vr_trailing_zeros && last_removed_digit == 5 &&
    355 				(vr & 1 == 0)) {
    356 			// round to even
    357 			last_removed_digit = 4;
    358 		};
    359 		output = vr + ibool((vr == vm &&
    360 			(!accept_bounds || !vm_trailing_zeros)) ||
    361 			last_removed_digit >= 5);
    362 	} else {
    363 		let round_up = false;
    364 		const vpby100 = vp / 100, vmby100 = vm / 100;
    365 		if (vpby100 > vmby100) {
    366 			const vrby100 = vr / 100;
    367 			const vrmod100 = (vr: u32) - 100 * (vrby100: u32);
    368 			round_up = vrmod100 >= 50;
    369 			vr = vrby100; vp = vpby100; vm = vmby100;
    370 			removed += 2;
    371 		};
    372 		for (true) {
    373 			const vmby10 = vm / 10, vpby10 = vp / 10;
    374 			if (vpby10 <= vmby10) break;
    375 			const vrby10 = vr / 10;
    376 			const vrmod10 = (vr: u32) - 10 * (vrby10: u32);
    377 			round_up = vrmod10 >= 5;
    378 			vr = vrby10; vp = vpby10; vm = vmby10;
    379 			removed += 1;
    380 		};
    381 		output = vr + ibool(vr == vm || round_up);
    382 	};
    383 	const exp = e10 + removed;
    384 	return decf64 { exponent = exp, mantissa = output };
    385 };
    386 
    387 type decf32 = struct {
    388 	mantissa: u32,
    389 	exponent: i32,
    390 };
    391 
    392 fn f32todecf32(mantissa: u32, exponent: u32) decf32 = {
    393 	let e2 = (math::F32_EXPONENT_BIAS + math::F32_MANTISSA_BITS + 2): i32;
    394 	let m2: u32 = 0;
    395 	if (exponent == 0) {
    396 		e2 = 1 - e2;
    397 		m2 = mantissa;
    398 	} else {
    399 		e2 = (exponent: i32) - e2;
    400 		m2 = (1u32 << math::F32_MANTISSA_BITS: u32) | mantissa;
    401 	};
    402 	const accept_bounds = (m2 & 1) == 0;
    403 	const mv = 4 * m2, mp = mv + 2;
    404 	const mm_shift = ibool(mantissa != 0 || exponent <= 1);
    405 	const mm = mv - 1 - mm_shift;
    406 	let vr: u32 = 0, vp: u32 = 0, vm: u32 = 0;
    407 	let e10: i32 = 0;
    408 	let vm_trailing_zeroes = false, vr_trailing_zeroes = false;
    409 	let last_removed_digit: u8 = 0;
    410 	if (e2 >= 0) {
    411 		const q = log10pow2(e2: u32);
    412 		e10 = q: i32;
    413 		const k = F32_POW5_INV_BITCOUNT + pow5bits(q) - 1;
    414 		const i = -e2 + (q + k): i32;
    415 		vr = mulpow5inv_divpow2(mv, q, i);
    416 		vp = mulpow5inv_divpow2(mp, q, i);
    417 		vm = mulpow5inv_divpow2(mm, q, i);
    418 		if (q != 0 && (vp - 1) / 10 <= vm / 10) {
    419 			const l = F32_POW5_INV_BITCOUNT + pow5bits(q - 1) - 1;
    420 			last_removed_digit = (mulpow5inv_divpow2(mv, q - 1,
    421 				-e2 + ((q + l): i32) - 1) % 10): u8;
    422 		};
    423 		if (q <= 9) {
    424 			if (mv % 5 == 0) {
    425 				vr_trailing_zeroes = pow5multiple32(mv, q);
    426 			} else if (accept_bounds) {
    427 				vm_trailing_zeroes = pow5multiple32(mm, q);
    428 			} else {
    429 				vp -= ibool(pow5multiple32(mp, q));
    430 			};
    431 		};
    432 	} else {
    433 		const q = log10pow5((-e2): u32);
    434 		e10 = (q: i32) + e2;
    435 		const i = (-e2 - (q: i32)): u32;
    436 		const k = pow5bits(i) - F32_POW5_BITCOUNT;
    437 		let j = (q: i32) - k: i32;
    438 		vr = mulpow5_divpow2(mv, i, j);
    439 		vp = mulpow5_divpow2(mp, i, j);
    440 		vm = mulpow5_divpow2(mm, i, j);
    441 		if (q != 0 && (vp - 1) / 10 <= vm / 10) {
    442 			j = (q: i32) - 1 - (pow5bits(i + 1): i32 -
    443 				F32_POW5_BITCOUNT: i32);
    444 			last_removed_digit = (mulpow5_divpow2(mv,
    445 				(i + 1), j) % 10): u8;
    446 		};
    447 		if (q <= 1) {
    448 			vr_trailing_zeroes = true;
    449 			if (accept_bounds) {
    450 				vm_trailing_zeroes = mm_shift == 1;
    451 			} else {
    452 				vp -= 1;
    453 			};
    454 		} else if (q < 31) {
    455 			vr_trailing_zeroes = pow2multiple32(mv, q - 1);
    456 		};
    457 	};
    458 	let removed: i32 = 0, output: u32 = 0;
    459 	if (vm_trailing_zeroes || vr_trailing_zeroes) {
    460 		for (vp / 10 > vm / 10) {
    461 			vm_trailing_zeroes &&= (vm - (vm / 10) * 10) == 0;
    462 			vr_trailing_zeroes &&= last_removed_digit == 0;
    463 			last_removed_digit = (vr % 10): u8;
    464 			vr /= 10;
    465 			vp /= 10;
    466 			vm /= 10;
    467 			removed += 1;
    468 		};
    469 		if (vm_trailing_zeroes) {
    470 			for (vm % 10 == 0) {
    471 				vr_trailing_zeroes &&= last_removed_digit == 0;
    472 				last_removed_digit = (vr % 10): u8;
    473 				vr /= 10;
    474 				vp /= 10;
    475 				vm /= 10;
    476 				removed += 1;
    477 			};
    478 		};
    479 		if (vr_trailing_zeroes && last_removed_digit == 5 &&
    480 				vr % 2 == 0) {
    481 			// round to even
    482 			last_removed_digit = 4;
    483 		};
    484 		output = vr + ibool((vr == vm &&
    485 			(!accept_bounds || !vm_trailing_zeroes)) ||
    486 			last_removed_digit >= 5);
    487 	} else {
    488 		for (vp / 10 > vm / 10) {
    489 			last_removed_digit = (vr % 10): u8;
    490 			vr /= 10;
    491 			vp /= 10;
    492 			vm /= 10;
    493 			removed += 1;
    494 		};
    495 		output = vr + ibool(vr == vm || last_removed_digit >= 5);
    496 	};
    497 	const exp = e10 + removed;
    498 	return decf32 { mantissa = output, exponent = exp };
    499 };
    500 
    501 def F32_DECIMAL_DIGITS: i32 = 9;
    502 def F64_DECIMAL_DIGITS: i32 = 17;