From 37c727e80fa50ff35bd809311e7e349c3d639aae Mon Sep 17 00:00:00 2001 From: Toni Wilen Date: Tue, 10 Jan 2017 20:27:12 +0200 Subject: [PATCH] FPU softfloat update. --- fpp.cpp | 979 ++++-- include/fpp-softfloat.h | 541 ++++ od-win32/md-fpp.h | 4 +- od-win32/sysconfig.h | 1 - softfloat/README.txt | 78 + softfloat/SOFTFLOAT-MACROS.H | 743 +++++ softfloat/fpu_constant.h | 80 + softfloat/fsincos.cpp | 652 ++++ softfloat/fyl2x.cpp | 494 +++ softfloat/mamesf.h | 72 + softfloat/milieu.h | 42 + softfloat/softfloat-specialize.h | 554 ++++ softfloat/softfloat.cpp | 5099 ++++++++++++++++++++++++++++++ softfloat/softfloat.h | 485 +++ 14 files changed, 9538 insertions(+), 286 deletions(-) create mode 100644 include/fpp-softfloat.h create mode 100644 softfloat/README.txt create mode 100644 softfloat/SOFTFLOAT-MACROS.H create mode 100644 softfloat/fpu_constant.h create mode 100644 softfloat/fsincos.cpp create mode 100644 softfloat/fyl2x.cpp create mode 100644 softfloat/mamesf.h create mode 100644 softfloat/milieu.h create mode 100644 softfloat/softfloat-specialize.h create mode 100644 softfloat/softfloat.cpp create mode 100644 softfloat/softfloat.h diff --git a/fpp.cpp b/fpp.cpp index 1ef695c5..9d8e93d1 100644 --- a/fpp.cpp +++ b/fpp.cpp @@ -35,9 +35,7 @@ #include "cpummu030.h" #include "debug.h" -#ifdef WITH_SOFTFLOAT -#include "softfloatx80.h" -#endif +#include "softfloat/softfloat.h" #define DEBUG_FPP 0 #define EXCEPTION_FPP 1 @@ -119,6 +117,7 @@ static double *fp_1e4096 = (double *)dhex_inf; static double *fp_inf = (double *)dhex_inf; static double *fp_nan = (double *)dhex_nan; #endif +static const double twoto32 = 4294967296.0; double fp_1e8 = 1.0e8; float fp_1e0 = 1, fp_1e1 = 10, fp_1e2 = 100, fp_1e4 = 10000; static bool fpu_mmu_fixup; @@ -128,12 +127,9 @@ static bool fpu_mmu_fixup; #define FFLAG_NAN 0x0400 -#ifdef WITH_SOFTFLOAT static floatx80 fxsizes[6] = { 0 }; static floatx80 fxzero; static floatx80 fx_1e0, fx_1e1, fx_1e2, fx_1e4, fx_1e8; -struct float_status_t fxstatus; -#endif static const fptype fsizes[] = { -128.0, 127.0, -32768.0, 32767.0, -2147483648.0, 2147483647.0 }; #define FP_INEXACT (1 << 9) @@ -146,181 +142,409 @@ static const fptype fsizes[] = { -128.0, 127.0, -32768.0, 32767.0, -2147483648.0 STATIC_INLINE void MAKE_FPSR (fptype *fp) { -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) + if (currprefs.fpu_softfloat) { return; -#endif - int status = fetestexcept (FE_ALL_EXCEPT); - if (status) { - if (status & FE_INEXACT) - regs.fp_result_status |= FP_INEXACT; - if (status & FE_DIVBYZERO) - regs.fp_result_status |= FP_DIVBYZERO; - if (status & FE_UNDERFLOW) - regs.fp_result_status |= FP_UNDERFLOW; - if (status & FE_OVERFLOW) - regs.fp_result_status |= FP_OVERFLOW; - if (status & FE_INVALID) - regs.fp_result_status |= FP_OPERAND; - } - regs.fp_result.fp = *fp; -} - -#ifdef WITH_SOFTFLOAT + } else { + int status = fetestexcept (FE_ALL_EXCEPT); + if (status) { + if (status & FE_INEXACT) + regs.fp_result_status |= FP_INEXACT; + if (status & FE_DIVBYZERO) + regs.fp_result_status |= FP_DIVBYZERO; + if (status & FE_UNDERFLOW) + regs.fp_result_status |= FP_UNDERFLOW; + if (status & FE_OVERFLOW) + regs.fp_result_status |= FP_OVERFLOW; + if (status & FE_INVALID) + regs.fp_result_status |= FP_OPERAND; + } + regs.fp_result.fp = *fp; + } +} + STATIC_INLINE void MAKE_FPSR_SOFTFLOAT(floatx80 fx) { - if (fxstatus.float_exception_flags & float_flag_invalid) + if (float_exception_flags & float_flag_invalid) regs.fp_result_status |= FP_OPERAND; - if (fxstatus.float_exception_flags & float_flag_divbyzero) + if (float_exception_flags & float_flag_divbyzero) regs.fp_result_status |= FP_DIVBYZERO; - if (fxstatus.float_exception_flags & float_flag_overflow) + if (float_exception_flags & float_flag_overflow) regs.fp_result_status |= FP_OVERFLOW; - if (fxstatus.float_exception_flags & float_flag_underflow) + if (float_exception_flags & float_flag_underflow) regs.fp_result_status |= FP_UNDERFLOW; - if (fxstatus.float_exception_flags & float_flag_inexact) + if (float_exception_flags & float_flag_inexact) regs.fp_result_status |= FP_INEXACT; regs.fp_result.fpx = fx; } -#endif -STATIC_INLINE void CLEAR_STATUS (void) +static void CLEAR_STATUS (void) { -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) + if (currprefs.fpu_softfloat) { + float_exception_flags = 0; return; -#endif + } feclearexcept (FE_ALL_EXCEPT); } -#ifdef WITH_SOFTFLOAT -static void softfloat_set(floatx80 *fx, uae_u32 *f) +static void fp_roundsgl(fpdata *fpd) { - fx->exp = (uae_u16)f[2]; - fx->fraction = ((uae_u64)f[1] << 32) | f[0]; + if (currprefs.fpu_softfloat) { + fpd->fpx = floatx80_round32(fpd->fpx); + } else { + int expon; + float mant; +#ifdef USE_LONG_DOUBLE + mant = (float)(frexpl(fpd->fp, &expon) * 2.0); + fpd->fp = ldexpl((fptype)mant, expon - 1); +#else + mant = (float)(frexp(fpd->fp, &expon) * 2.0); + fpd->fp = ldexp((fptype)mant, expon - 1); +#endif + } } -static void softfloat_get(floatx80 *fx, uae_u32 *f) +static void fp_round32(fpdata *fpd) { - f[2] = fx->exp; - f[1] = fx->fraction >> 32; - f[0] = (uae_u32)fx->fraction; + if (currprefs.fpu_softfloat) { + float32 f = floatx80_to_float32(fpd->fpx); + fpd->fpx = float32_to_floatx80(f); + } else { + fpd->fp = (float)fpd->fp; + } } +static void fp_round64(fpdata *fpd) +{ + if (currprefs.fpu_softfloat) { + float64 f = floatx80_to_float64(fpd->fpx); + fpd->fpx = float64_to_floatx80(f); + } else { +#ifdef USE_LONG_DOUBLE + fpd->fp = (double)fpd->fp; +#else + ; #endif + } +} -static void fpnan (fpdata *fpd) +static void to_native(fptype *fp, fpdata *fpd) { - fpd->fp = *fp_nan; -#ifdef WITH_SOFTFLOAT - softfloat_set(&fpd->fpx, xhex_nan); -#endif + if (currprefs.fpu_softfloat) { + int expon; + fptype frac; + floatx80 *fpx = &fpd->fpx; + + expon = fpx->high & 0x7fff; + + if (floatx80_is_zero(*fpx)) { + *fp = floatx80_is_negative(*fpx) ? -0.0 : +0.0; + return; + } + if (floatx80_is_nan(*fpx)) { + *fp = sqrtl(-1); + return; + } + if (floatx80_is_infinity(*fpx)) { + *fp = floatx80_is_negative(*fpx) ? logl(0.0) : (1.0/0.0); + return; + } + + frac = (long double)fpx->low / (long double)(twoto32 * 2147483648.0); + if (floatx80_is_negative(*fpx)) + frac = -frac; + *fp = ldexpl (frac, expon - 16383); + } else { + *fp = fpd->fp; + } } -static void fpclear (fpdata *fpd) +static void from_native(fptype fp, fpdata *fpd) { - fpd->fp = 0; -#ifdef WITH_SOFTFLOAT - fpd->fpx = int32_to_floatx80(0); -#endif + if (currprefs.fpu_softfloat) { + int expon; + fptype frac; + floatx80 *fpx = &fpd->fpx; + + if (signbit(fp)) + fpx->high = 0x8000; + else + fpx->high = 0x0000; + + if (isnan(fp)) { + fpx->high |= 0x7fff; + fpx->low = LIT64(0xffffffffffffffff); + return; + } + if (isinf(fp)) { + fpx->high |= 0x7fff; + fpx->low = LIT64(0x0000000000000000); + return; + } + if (fp == 0.0) { + fpx->low = LIT64(0x0000000000000000); + return; + } + if (fp < 0.0) + fp = -fp; + + frac = frexpl (fp, &expon); + frac += 0.5 / (twoto32 * twoto32); + if (frac >= 1.0) { + frac /= 2.0; + expon++; + } + fpx->high |= (expon + 16383 - 1) & 0x7fff; + fpx->low = (bits64)(frac * (long double)(twoto32 * twoto32)); + + while (!(fpx->low & LIT64( 0x8000000000000000))) { + if (fpx->high == 0) { + float_raise( float_flag_denormal ); + break; + } + fpx->low <<= 1; + fpx->high--; + } + } else { + fpd->fp = fp; + } } -static void fpset (fpdata *fpd, uae_s32 val) + +static void softfloat_set(floatx80 *fx, uae_u32 *f) { - fpd->fp = (fptype)val; -#ifdef WITH_SOFTFLOAT - fpd->fpx = int32_to_floatx80(val); -#endif + fx->high = (uae_u16)f[2]; + fx->low = ((uae_u64)f[1] << 32) | f[0]; +} +static void softfloat_get(floatx80 *fx, uae_u32 *f) +{ + f[2] = fx->high; + f[1] = fx->low >> 32; + f[0] = (uae_u32)fx->low; } -void to_single(fpdata *fpd, uae_u32 value) +static void to_single_xn(fpdata *fpd, uae_u32 wrd1) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { - float32 f = value; - fpd->fpx = float32_to_floatx80(f, fxstatus); - } else -#endif - fpd->fp = to_single_x(value); + float32 f = wrd1; + fpd->fpx = float32_to_floatx80(f); // automatically fix denormals + } else { + union { + float f; + uae_u32 u; + } val; + val.u = wrd1; + fpd->fp = (fptype) val.f; + } } -static uae_u32 from_single(fpdata *fpd) +static void to_single_x(fpdata *fpd, uae_u32 wrd1) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { - float32 f = floatx80_to_float32(fpd->fpx, fxstatus); + float32 f = wrd1; + fpd->fpx = float32_to_floatx80_allowunnormal(f); + } else { + union { + float f; + uae_u32 u; + } val; + val.u = wrd1; + fpd->fp = (fptype) val.f; + } +} +static uae_u32 from_single_x(fpdata *fpd) +{ + if (currprefs.fpu_softfloat) { + float32 f = floatx80_to_float32(fpd->fpx); return f; - } else + } else { + union { + float f; + uae_u32 u; + } val; + val.f = (float)fpd->fp; + return val.u; + } +} + +static void to_double_xn(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2) +{ + if (currprefs.fpu_softfloat) { + float64 f = ((float64)wrd1 << 32) | wrd2; + fpd->fpx = float64_to_floatx80(f); // automatically fix denormals + } else { + union { + double d; + uae_u32 u[2]; + } val; +#ifdef WORDS_BIGENDIAN + val.u[0] = wrd1; + val.u[1] = wrd2; +#else + val.u[1] = wrd1; + val.u[0] = wrd2; #endif - return from_single_x(fpd->fp); + fpd->fp = (fptype) val.d; + } } -void to_double(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2) +static void to_double_x(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2) { -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) { + if (currprefs.fpu_softfloat) { float64 f = ((float64)wrd1 << 32) | wrd2; - fpd->fpx = float64_to_floatx80(f, fxstatus); - } else + fpd->fpx = float64_to_floatx80_allowunnormal(f); + } else { + union { + double d; + uae_u32 u[2]; + } val; +#ifdef WORDS_BIGENDIAN + val.u[0] = wrd1; + val.u[1] = wrd2; +#else + val.u[1] = wrd1; + val.u[0] = wrd2; #endif - fpd->fp = to_double_x(wrd1, wrd2); + fpd->fp = (fptype) val.d; + } } -static void from_double(fpdata *fpd, uae_u32 *wrd1, uae_u32 *wrd2) +static void from_double_x(fpdata *fpd, uae_u32 *wrd1, uae_u32 *wrd2) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { - float64 f = floatx80_to_float64(fpd->fpx, fxstatus); + float64 f = floatx80_to_float64(fpd->fpx); *wrd1 = f >> 32; *wrd2 = (uae_u32)f; - return; - } else + } else { + union { + double d; + uae_u32 u[2]; + } val; + val.d = (double) fpd->fp; +#ifdef WORDS_BIGENDIAN + *wrd1 = val.u[0]; + *wrd2 = val.u[1]; +#else + *wrd1 = val.u[1]; + *wrd2 = val.u[0]; #endif - return from_double_x(fpd->fp, wrd1, wrd2); + } } -void to_exten(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3) +#ifdef USE_LONG_DOUBLE +static void to_exten_x(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { - fpd->fpx.exp = wrd1 >> 16; - fpd->fpx.fraction = ((uae_u64)wrd2 << 32) | wrd3; -#if 0 - if ((currprefs.fpu_model == 68881 || currprefs.fpu_model == 68882) || currprefs.fpu_no_unimplemented) { - // automatically fix denormals if 6888x or no implemented emulation - Bit64u Sig = extractFloatx80Frac(fpd->fpx); - Bit32s Exp = extractFloatx80Exp(fpd->fpx); - if (Exp == 0 && Sig != 0) - normalizeFloatx80Subnormal(Sig, &Exp, &Sig); - } -#endif - } else + uae_u32 wrd[3] = { wrd1, wrd2, wrd3 }; + softfloat_set(&fpd->fpx, wrd); + } else { + union { + fptype ld; + uae_u32 u[3]; + } val; +#if WORDS_BIGENDIAN + val.u[0] = (wrd1 & 0xffff0000) | ((wrd2 & 0xffff0000) >> 16); + val.u[1] = (wrd2 & 0x0000ffff) | ((wrd3 & 0xffff0000) >> 16); + val.u[2] = (wrd3 & 0x0000ffff) << 16; +#else + val.u[0] = wrd3; + val.u[1] = wrd2; + val.u[2] = wrd1 >> 16; #endif - to_exten_x(&fpd->fp, wrd1, wrd2, wrd3); + fpd->fp = val.ld; + } } -static void from_exten(fpdata *fpd, uae_u32 * wrd1, uae_u32 * wrd2, uae_u32 * wrd3) +static void from_exten_x(fpdata *fpd, uae_u32 *wrd1, uae_u32 *wrd2, uae_u32 *wrd3) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { - *wrd1 = fpd->fpx.exp << 16; - *wrd2 = fpd->fpx.fraction >> 32; - *wrd3 = (uae_u32)fpd->fpx.fraction; - } else + uae_u32 wrd[3]; + softfloat_get(&fpd->fpx, wrd); + *wrd1 = wrd[0]; + *wrd2 = wrd[1]; + *wrd3 = wrd[2]; + } else { + union { + fptype ld; + uae_u32 u[3]; + } val; + val.ld = *fp; +#if WORDS_BIGENDIAN + *wrd1 = val.u[0] & 0xffff0000; + *wrd2 = ((val.u[0] & 0x0000ffff) << 16) | ((val.u[1] & 0xffff0000) >> 16); + *wrd3 = ((val.u[1] & 0x0000ffff) << 16) | ((val.u[2] & 0xffff0000) >> 16); +#else + *wrd3 = val.u[0]; + *wrd2 = val.u[1]; + *wrd1 = val.u[2] << 16; #endif - from_exten_x(fpd->fp, wrd1, wrd2, wrd3); + } } +#else // if !USE_LONG_DOUBLE +static void to_exten_x(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3) +{ + if (currprefs.fpu_softfloat) { + uae_u32 wrd[3] = { wrd1, wrd2, wrd3 }; + softfloat_set(&fpd->fpx, wrd); + } else { + fptype frac; + if ((wrd1 & 0x7fff0000) == 0 && wrd2 == 0 && wrd3 == 0) { + fpd->fp = (wrd1 & 0x80000000) ? -0.0 : +0.0; + return; + } + frac = ((fptype)wrd2 + ((fptype)wrd3 / twoto32)) / 2147483648.0; + if (wrd1 & 0x80000000) + frac = -frac; + fpd->fp = ldexp (frac, ((wrd1 >> 16) & 0x7fff) - 16383); + } +} +static void from_exten_x(fpdata *fpd, uae_u32 *wrd1, uae_u32 *wrd2, uae_u32 *wrd3) +{ + if (currprefs.fpu_softfloat) { + uae_u32 wrd[3]; + softfloat_get(&fpd->fpx, wrd); + *wrd1 = wrd[0]; + *wrd2 = wrd[1]; + *wrd3 = wrd[2]; + } else { + int expon; + fptype frac; + fptype v; + + v = fpd->fp; + if (v == 0.0) { + *wrd1 = signbit(v) ? 0x80000000 : 0; + *wrd2 = 0; + *wrd3 = 0; + return; + } + if (v < 0) { + *wrd1 = 0x80000000; + v = -v; + } else { + *wrd1 = 0; + } + frac = frexp (v, &expon); + frac += 0.5 / (twoto32 * twoto32); + if (frac >= 1.0) { + frac /= 2.0; + expon++; + } + *wrd1 |= (((expon + 16383 - 1) & 0x7fff) << 16); + *wrd2 = (uae_u32) (frac * twoto32); + *wrd3 = (uae_u32) ((frac * twoto32 - *wrd2) * twoto32); + } +} +#endif // !USE_LONG_DOUBLE - -#if 0 -static void normalize(uae_u32 *pwrd1, uae_u32 *pwrd2, uae_u32 *pwrd3) +static void normalize_exten(uae_u32 *pwrd1, uae_u32 *pwrd2, uae_u32 *pwrd3) { uae_u32 wrd1 = *pwrd1; uae_u32 wrd2 = *pwrd2; uae_u32 wrd3 = *pwrd3; - int exp = (wrd1 >> 16) & 0x7fff; + uae_u16 exp = (wrd1 >> 16) & 0x7fff; // Normalize if unnormal. if (exp != 0 && exp != 0x7fff && !(wrd2 & 0x80000000)) { while (!(wrd2 & 0x80000000) && (wrd2 || wrd3)) { + if (exp == 0) + break; // Result is denormal wrd2 <<= 1; if (wrd3 & 0x80000000) wrd2 |= 1; wrd3 <<= 1; exp--; } - if (exp < 0) - exp = 0; if (!wrd2 && !wrd3) exp = 0; *pwrd1 = (wrd1 & 0x80000000) | (exp << 16); @@ -328,7 +552,189 @@ static void normalize(uae_u32 *pwrd1, uae_u32 *pwrd2, uae_u32 *pwrd3) *pwrd3 = wrd3; } } + +static void from_int(fpdata *fpd, uae_s32 src) +{ + if (currprefs.fpu_softfloat) { + fpd->fpx = int32_to_floatx80(src); + } else { + fpd->fp = (fptype)src; + } +} + +static void fpclear (fpdata *fpd) +{ + from_int(fpd, 0); +} +static void fpset (fpdata *fpd, uae_s32 val) +{ + from_int(fpd, val); +} + +void to_single(fpdata *fpd, uae_u32 wrd1) +{ + // automatically fix denormals if 6888x + if (currprefs.fpu_model == 68881 || currprefs.fpu_model == 68882) + to_single_xn(fpd, wrd1); + else + to_single_x(fpd, wrd1); +} +static uae_u32 from_single(fpdata *fpd) +{ + return from_single_x(fpd); +} +void to_double(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2) +{ + // automatically fix denormals if 6888x + if (currprefs.fpu_model == 68881 || currprefs.fpu_model == 68882) + to_double_xn(fpd, wrd1, wrd2); + else + to_double_x(fpd, wrd1, wrd2); +} +static void from_double(fpdata *fpd, uae_u32 *wrd1, uae_u32 *wrd2) +{ + from_double_x(fpd, wrd1, wrd2); +} + +void to_exten(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3) +{ + // automatically fix unnormals if 6888x + if (currprefs.fpu_model == 68881 || currprefs.fpu_model == 68882) { + normalize_exten(&wrd1, &wrd2, &wrd3); + } + to_exten_x(fpd, wrd1, wrd2, wrd3); +} +static void to_exten_fmovem(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3) +{ + to_exten_x(fpd, wrd1, wrd2, wrd3); +} +static void from_exten(fpdata *fpd, uae_u32 * wrd1, uae_u32 * wrd2, uae_u32 * wrd3) +{ + from_exten_x(fpd, wrd1, wrd2, wrd3); +} + +/* Floating Point Control Register (FPCR) + * + * Exception Enable Byte + * x--- ---- ---- ---- bit 15: BSUN (branch/set on unordered) + * -x-- ---- ---- ---- bit 14: SNAN (signaling not a number) + * --x- ---- ---- ---- bit 13: OPERR (operand error) + * ---x ---- ---- ---- bit 12: OVFL (overflow) + * ---- x--- ---- ---- bit 11: UNFL (underflow) + * ---- -x-- ---- ---- bit 10: DZ (divide by zero) + * ---- --x- ---- ---- bit 9: INEX 2 (inexact operation) + * ---- ---x ---- ---- bit 8: INEX 1 (inexact decimal input) + * + * Mode Control Byte + * ---- ---- xx-- ---- bits 7 and 6: PREC (rounding precision) + * ---- ---- --xx ---- bits 5 and 4: RND (rounding mode) + * ---- ---- ---- xxxx bits 3 to 0: all 0 + */ + +#define FPCR_PREC 0x00C0 +#define FPCR_RND 0x0030 + +/* Floating Point Status Register (FPSR) + * + * Condition Code Byte + * xxxx ---- ---- ---- ---- ---- ---- ---- bits 31 to 28: all 0 + * ---- x--- ---- ---- ---- ---- ---- ---- bit 27: N (negative) + * ---- -x-- ---- ---- ---- ---- ---- ---- bit 26: Z (zero) + * ---- --x- ---- ---- ---- ---- ---- ---- bit 25: I (infinity) + * ---- ---x ---- ---- ---- ---- ---- ---- bit 24: NAN (not a number or unordered) + * + * Quotient Byte (set and reset only by FMOD and FREM) + * ---- ---- x--- ---- ---- ---- ---- ---- bit 23: sign of quotient + * ---- ---- -xxx xxxx ---- ---- ---- ---- bits 22 to 16: 7 least significant bits of quotient + * + * Exception Status Byte + * ---- ---- ---- ---- x--- ---- ---- ---- bit 15: BSUN (branch/set on unordered) + * ---- ---- ---- ---- -x-- ---- ---- ---- bit 14: SNAN (signaling not a number) + * ---- ---- ---- ---- --x- ---- ---- ---- bit 13: OPERR (operand error) + * ---- ---- ---- ---- ---x ---- ---- ---- bit 12: OVFL (overflow) + * ---- ---- ---- ---- ---- x--- ---- ---- bit 11: UNFL (underflow) + * ---- ---- ---- ---- ---- -x-- ---- ---- bit 10: DZ (divide by zero) + * ---- ---- ---- ---- ---- --x- ---- ---- bit 9: INEX 2 (inexact operation) + * ---- ---- ---- ---- ---- ---x ---- ---- bit 8: INEX 1 (inexact decimal input) + * + * Accrued Exception Byte + * ---- ---- ---- ---- ---- ---- x--- ---- bit 7: IOP (invalid operation) + * ---- ---- ---- ---- ---- ---- -x-- ---- bit 6: OVFL (overflow) + * ---- ---- ---- ---- ---- ---- --x- ---- bit 5: UNFL (underflow) + * ---- ---- ---- ---- ---- ---- ---x ---- bit 4: DZ (divide by zero) + * ---- ---- ---- ---- ---- ---- ---- x--- bit 3: INEX (inexact) + * ---- ---- ---- ---- ---- ---- ---- -xxx bits 2 to 0: all 0 + */ + +#define FPSR_ZEROBITS 0xF0000007 + +#define FPSR_CC_N 0x08000000 +#define FPSR_CC_Z 0x04000000 +#define FPSR_CC_I 0x02000000 +#define FPSR_CC_NAN 0x01000000 + +#define FPSR_QUOT_SIGN 0x00800000 +#define FPSR_QUOT_LSB 0x007F0000 + +#define FPSR_BSUN 0x00008000 +#define FPSR_SNAN 0x00004000 +#define FPSR_OPERR 0x00002000 +#define FPSR_OVFL 0x00001000 +#define FPSR_UNFL 0x00000800 +#define FPSR_DZ 0x00000400 +#define FPSR_INEX2 0x00000200 +#define FPSR_INEX1 0x00000100 + +#define FPSR_AE_IOP 0x00000080 +#define FPSR_AE_OVFL 0x00000040 +#define FPSR_AE_UNFL 0x00000020 +#define FPSR_AE_DZ 0x00000010 +#define FPSR_AE_INEX 0x00000008 + +static void fpnan (fpdata *fpd) +{ + to_exten(fpd, xhex_nan[0], xhex_nan[1], xhex_nan[2]); +} + +const TCHAR *fp_print(fpdata *fpd) +{ + static TCHAR fs[32]; + if (currprefs.fpu_softfloat) { + bool n, u, d; + fptype result = 0.0; + int i; + floatx80 *fx = &fpd->fpx; + + n = floatx80_is_negative(*fx); + u = floatx80_is_unnormal(*fx); + d = floatx80_is_denormal(*fx); + + if (floatx80_is_zero(*fx)) { + _stprintf(fs, _T("%c%#.17Le%s%s"), n?'-':'+', (fptype) 0.0, u ? _T("U") : _T(""), d ? _T("D") : _T("")); + } else if (floatx80_is_infinity(*fx)) { + _stprintf(fs, _T("%c%s"), n?'-':'+', _T("inf")); + } else if (floatx80_is_signaling_nan(*fx)) { + _stprintf(fs, _T("%c%s"), n?'-':'+', _T("snan")); + } else if (floatx80_is_nan(*fx)) { + _stprintf(fs, _T("%c%s"), n?'-':'+', _T("nan")); + } else { + for (i = 63; i >= 0; i--) { + if (fx->low & (((uae_u64)1)<high&0x7FFF) - 0x3FFF); + _stprintf(fs, _T("%c%#.17Le%s%s"), n?'-':'+', result, u ? _T("U") : _T(""), d ? _T("D") : _T("")); + } + } else { +#if USE_LONG_DOUBLE + _stprintf(fs, _T("#%Le"), fpd->fp); +#else + _stprintf(fs, _T("#%e"), fpd->fp); #endif + } + return fs; +} static bool fpu_get_constant_fp(fpdata *fp, int cr) { @@ -408,7 +814,6 @@ static bool fpu_get_constant_fp(fpdata *fp, int cr) return true; } -#ifdef WITH_SOFTFLOAT static bool fpu_get_constant_softfloat(fpdata *fp, int cr) { uae_u32 *f = NULL; @@ -491,52 +896,36 @@ static bool fpu_get_constant_softfloat(fpdata *fp, int cr) fp->fpx = fx; return true; } -#endif bool fpu_get_constant(fpdata *fp, int cr) { -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) + if (currprefs.fpu_softfloat) { return fpu_get_constant_softfloat(fp, cr); -#endif - return fpu_get_constant_fp(fp, cr); + } else { + return fpu_get_constant_fp(fp, cr); + } } -#ifdef WITH_SOFTFLOAT - -static inline void set_fpucw_softfloat(uae_u32 m68k_cw) +static void set_fpucw_softfloat(uae_u32 m68k_cw) { - switch((m68k_cw >> 6) & 3) { - case 0: // X - default: // undefined - fxstatus.float_rounding_precision = 80; - break; - case 1: // S - fxstatus.float_rounding_precision = 32; - break; - case 2: // D - fxstatus.float_rounding_precision = 64; - break; - } + floatx80_rounding_precision = 80; switch((m68k_cw >> 4) & 3) { - case 0: // to neareset - fxstatus.float_rounding_precision = float_round_nearest_even; + case 0: // to nearest + float_rounding_mode = float_round_nearest_even; break; case 1: // to zero - fxstatus.float_rounding_mode = float_round_to_zero; + float_rounding_mode = float_round_to_zero; break; case 2: // to minus - fxstatus.float_rounding_mode = float_round_down; + float_rounding_mode = float_round_down; break; case 3: // to plus - fxstatus.float_rounding_mode = float_round_up; + float_rounding_mode = float_round_up; break; } return; } -#endif /* WITH_SOFTFLOAT */ - #if defined(CPU_i386) || defined(CPU_x86_64) /* The main motivation for dynamically creating an x86(-64) function in @@ -618,11 +1007,9 @@ static inline void set_fpucw_x87(uae_u32 m68k_cw) static void native_set_fpucw(uae_u32 m68k_cw) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { set_fpucw_softfloat(m68k_cw); } -#endif #if defined(CPU_i386) || defined(CPU_x86_64) set_fpucw_x87(m68k_cw); #endif @@ -989,24 +1376,21 @@ static void fpu_null (void) #define fp_round_to_zero(x) ((x) >= 0.0 ? floor(x) : ceil(x)) #define fp_round_to_nearest(x) ((x) >= 0.0 ? (int)((x) + 0.5) : (int)((x) - 0.5)) -static tointtype toint(fpdata *src, int size) +static tointtype to_int(fpdata *src, int size) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { - if (floatx80_compare(src->fpx, fxsizes[size * 2 + 0], fxstatus) == float_relation_greater) - return floatx80_to_int32(fxsizes[size * 2 + 0], fxstatus); - if (floatx80_compare(src->fpx, fxsizes[size * 2 + 1], fxstatus) == float_relation_less) - return floatx80_to_int32(fxsizes[size * 2 + 1], fxstatus); - return floatx80_to_int32(src->fpx, fxstatus); - } else -#endif - { + if (floatx80_lt(src->fpx, fxsizes[size * 2 + 0])) + return floatx80_to_int32(fxsizes[size * 2 + 0]); + if (floatx80_le(fxsizes[size * 2 + 1], src->fpx)) + return floatx80_to_int32(fxsizes[size * 2 + 1]); + return floatx80_to_int32(src->fpx); + } else { fptype fp = src->fp; if (fp < fsizes[size * 2 + 0]) fp = fsizes[size * 2 + 0]; if (fp > fsizes[size * 2 + 1]) fp = fsizes[size * 2 + 1]; - #if defined(X86_MSVC_ASSEMBLY_FPU) +#if defined(X86_MSVC_ASSEMBLY_FPU) { fptype tmp_fp; __asm { @@ -1016,7 +1400,7 @@ static tointtype toint(fpdata *src, int size) } return (tointtype)tmp_fp; } - #else /* no X86_MSVC */ +#else /* no X86_MSVC */ { int result = (int)fp; switch (regs.fpcr & 0x30) @@ -1036,61 +1420,80 @@ static tointtype toint(fpdata *src, int size) } return result; } - #endif +#endif } } static bool fp_is_snan(fpdata *fpd) { -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) + if (currprefs.fpu_softfloat) { return floatx80_is_signaling_nan(fpd->fpx) != 0; -#endif - return false; + } else { + return false; + } +} +static void fp_unset_snan(fpdata *fpd) +{ + fpd->fpx.low |= LIT64(0x4000000000000000); } static bool fp_is_nan (fpdata *fpd) { -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) + if (currprefs.fpu_softfloat) { return floatx80_is_nan(fpd->fpx) != 0; -#endif + } else { #ifdef HAVE_ISNAN - return isnan(fpd->fp) != 0; + return isnan(fpd->fp) != 0; #else - return false; + return false; #endif + } } static bool fp_is_infinity (fpdata *fpd) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { - float_class_t fc = floatx80_class(fpd->fpx); - return fc == float_negative_inf || fc == float_positive_inf; - } -#endif + return floatx80_is_infinity(fpd->fpx) != 0; + } else { #ifdef _MSC_VER - return !_finite (fpd->fp); + return !_finite (fpd->fp); #elif defined(HAVE_ISINF) - return isinf(fpd->fp); + return isinf(fpd->fp); #else - return false; + return false; #endif + } } static bool fp_is_zero(fpdata *fpd) { -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) - return floatx80_compare_quiet(fpd->fpx, fxzero, fxstatus) == float_relation_equal; -#endif - return fpd->fp == 0.0; + if (currprefs.fpu_softfloat) { + return floatx80_is_zero(fpd->fpx) != 0; + } else { + return fpd->fp == 0.0; + } } static bool fp_is_neg(fpdata *fpd) { -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) - return extractFloatx80Sign(fpd->fpx) != 0; -#endif - return fpd->fp < 0.0; + if (currprefs.fpu_softfloat) { + return floatx80_is_negative(fpd->fpx) != 0; + } else { + return fpd->fp < 0.0; + } +} + +static bool fp_is_denormal(fpdata *fpd) +{ + if (currprefs.fpu_softfloat) { + return floatx80_is_denormal(fpd->fpx) != 0; + } else { + return false; + } +} +static bool fp_is_unnormal(fpdata *fpd) +{ + if (currprefs.fpu_softfloat) { + return floatx80_is_unnormal(fpd->fpx) != 0; + } else { + return false; + } } uae_u32 fpp_get_fpsr (void) @@ -1188,6 +1591,19 @@ static void to_pack (fpdata *fpd, uae_u32 *wrd) char *cp; char str[100]; + if (((wrd[0] >> 16) & 0x7fff) == 0x7fff) { + // infinity has extended exponent and all 0 packed fraction + // nans are copies bit by bit + to_exten_fmovem(fpd, wrd[0], wrd[1], wrd[2]); + return; + } + if (!(wrd[0] & 0xf) && !wrd[1] && !wrd[2]) { + // exponent is not cared about, if mantissa is zero + wrd[0] &= 0x80000000; + to_exten_fmovem(fpd, wrd[0], wrd[1], wrd[2]); + return; + } + cp = str; if (wrd[0] & 0x80000000) *cp++ = '-'; @@ -1221,14 +1637,7 @@ static void to_pack (fpdata *fpd, uae_u32 *wrd) #else sscanf (str, "%le", &d); #endif -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) { - uae_u32 wrd[3]; - from_exten_x(d, &wrd[0], &wrd[1], &wrd[2]); - softfloat_set(&fpd->fpx, wrd); - } else -#endif - fpd->fp = d; + from_native(d, fpd); } static void from_pack (fpdata *src, uae_u32 *wrd, int kfactor) @@ -1240,23 +1649,21 @@ static void from_pack (fpdata *src, uae_u32 *wrd, int kfactor) char str[100]; fptype fp; - wrd[0] = wrd[1] = wrd[2] = 0; + if (fp_is_nan (src)) { + // copied bit by bit, no conversion + from_exten(src, &wrd[0], &wrd[1], &wrd[2]); + return; + } + if (fp_is_infinity (src)) { + // extended exponent and all 0 packed fraction + from_exten(src, &wrd[0], &wrd[1], &wrd[2]); + wrd[1] = wrd[2] = 0; + return; + } - if (fp_is_nan (src) || fp_is_infinity (src)) { - wrd[0] |= (1 << 30) | (1 << 29) | (1 << 30); // YY=1 - wrd[0] |= 0xfff << 16; // Exponent=FFF - // TODO: mantissa should be set if NAN - return; - } + wrd[0] = wrd[1] = wrd[2] = 0; -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) { - uae_u32 out[3]; - softfloat_get(&src->fpx, out); - to_exten_x(&fp, out[0], out[1], out[2]); - } else -#endif - fp = src->fp; + to_native(&fp, src); #if USE_LONG_DOUBLE sprintf (str, "%#.17Le", fp); @@ -1382,30 +1789,26 @@ static void from_pack (fpdata *src, uae_u32 *wrd, int kfactor) // 68040/060 does not support denormals static bool fault_if_no_denormal_support_pre(uae_u16 opcode, uae_u16 extra, uaecptr ea, uaecptr oldpc, fpdata *fpd, int size) { -#ifdef WITH_SOFTFLOAT if (currprefs.cpu_model >= 68040 && currprefs.fpu_model && currprefs.fpu_no_unimplemented && currprefs.fpu_softfloat) { - Bit64u Sig = extractFloatx80Frac(fpd->fpx); - Bit32s Exp = extractFloatx80Exp(fpd->fpx); + bits64 Sig = extractFloatx80Frac(fpd->fpx); + bits32 Exp = extractFloatx80Exp(fpd->fpx); if (Exp == 0 && Sig != 0) { fpu_op_unimp(opcode, extra, ea, oldpc, FPU_EXP_UNIMP_DATATYPE_PRE, fpd, -1, size); return true; } } -#endif return false; } static bool fault_if_no_denormal_support_post(uae_u16 opcode, uae_u16 extra, uaecptr ea, uaecptr oldpc, fpdata *fpd, int size) { -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat && currprefs.cpu_model >= 68040 && currprefs.fpu_model && currprefs.fpu_no_unimplemented) { - Bit64u Sig = extractFloatx80Frac(fpd->fpx); - Bit32s Exp = extractFloatx80Exp(fpd->fpx); + bits64 Sig = extractFloatx80Frac(fpd->fpx); + bits32 Exp = extractFloatx80Exp(fpd->fpx); if (Exp == 0 && Sig != 0) { fpu_op_unimp(opcode, extra, ea, oldpc, FPU_EXP_UNIMP_DATATYPE_POST, fpd, -1, size); return true; } } -#endif return false; } @@ -1628,15 +2031,15 @@ static int put_fp_value (fpdata *value, uae_u32 opcode, uae_u16 extra, uaecptr o switch (size) { case 6: - m68k_dreg (regs, reg) = (uae_u32)(((toint (value, 0) & 0xff) + m68k_dreg (regs, reg) = (uae_u32)(((to_int (value, 0) & 0xff) | (m68k_dreg (regs, reg) & ~0xff))); break; case 4: - m68k_dreg (regs, reg) = (uae_u32)(((toint (value, 1) & 0xffff) + m68k_dreg (regs, reg) = (uae_u32)(((to_int (value, 1) & 0xffff) | (m68k_dreg (regs, reg) & ~0xffff))); break; case 0: - m68k_dreg (regs, reg) = (uae_u32)toint (value, 2); + m68k_dreg (regs, reg) = (uae_u32)to_int (value, 2); break; case 1: m68k_dreg (regs, reg) = from_single (value); @@ -1703,7 +2106,7 @@ static int put_fp_value (fpdata *value, uae_u32 opcode, uae_u16 extra, uaecptr o case 0: if (fault_if_no_denormal_support_post(opcode, extra, ad, oldpc, value, 2)) return 1; - x_cp_put_long(ad, (uae_u32)toint(value, 2)); + x_cp_put_long(ad, (uae_u32)to_int(value, 2)); break; case 1: if (fault_if_no_denormal_support_post(opcode, extra, ad, oldpc, value, 2)) @@ -1745,7 +2148,7 @@ static int put_fp_value (fpdata *value, uae_u32 opcode, uae_u16 extra, uaecptr o case 4: if (fault_if_no_denormal_support_post(opcode, extra, ad, oldpc, value, 2)) return 1; - x_cp_put_word(ad, (uae_s16)toint(value, 1)); + x_cp_put_word(ad, (uae_s16)to_int(value, 1)); break; case 5: { @@ -1761,7 +2164,7 @@ static int put_fp_value (fpdata *value, uae_u32 opcode, uae_u16 extra, uaecptr o case 6: if (fault_if_no_denormal_support_post(opcode, extra, ad, oldpc, value, 2)) return 1; - x_cp_put_byte(ad, (uae_s8)toint(value, 0)); + x_cp_put_byte(ad, (uae_s8)to_int(value, 0)); break; default: return 0; @@ -2468,18 +2871,6 @@ static uaecptr fmovem2fpp (uaecptr ad, uae_u32 list, int incr, int regdir) return ad; } -// round to float -static void fround (int reg) -{ -#ifdef WITH_SOFTFLOAT - if (currprefs.fpu_softfloat) { - float32 f = floatx80_to_float32(regs.fp[reg].fpx, fxstatus); - regs.fp[reg].fpx = float32_to_floatx80(f, fxstatus); - } else -#endif - regs.fp[reg].fp = (float)regs.fp[reg].fp; -} - static bool arithmetic_fp(fptype src, int reg, int extra) { bool sgl = false; @@ -2676,6 +3067,10 @@ static bool arithmetic_fp(fptype src, int reg, int extra) case 0x37: regs.fp[extra & 7].fp = cos (src); regs.fp[reg].fp = sin (src); + if (((regs.fpcr >> 6) & 3) == 1) + fp_round32(®s.fp[extra & 7]); + else if (((regs.fpcr >> 6) & 3) == 2) + fp_round64(®s.fp[extra & 7]); break; case 0x38: /* FCMP */ { @@ -2691,25 +3086,29 @@ static bool arithmetic_fp(fptype src, int reg, int extra) default: return false; } - // round to float? - if (sgl || (extra & 0x44) == 0x40 || ((regs.fpcr >> 6) & 3) == 1) - fround (reg); + + if (sgl) { + fp_roundsgl(®s.fp[reg]); + } else if ((extra & 0x44) == 0x40 || ((regs.fpcr >> 6) & 3) == 1) { + fp_round32(®s.fp[reg]); + } else if ((extra & 0x44) == 0x44 || ((regs.fpcr >> 6) & 3) == 2) { + fp_round64(®s.fp[reg]); + } + MAKE_FPSR (®s.fp[reg].fp); return true; } -#ifdef WITH_SOFTFLOAT -static bool arithmetic_softfloat(floatx80 *srcd, int reg, int extra) +static bool arithmetic_softfloat(fpdata *srcd, int reg, int extra) { - floatx80 fx = *srcd; + floatx80 fx = srcd->fpx; floatx80 f = regs.fp[reg].fpx; - int float_rounding_mode; + int8 float_rounding_mode_temp; bool sgl = false; - Bit64u q; // SNAN -> QNAN if SNAN interrupt is not enabled if (floatx80_is_signaling_nan(fx) && !(regs.fpcr & 0x4000)) { - fx.fraction |= 0x40000000; + fp_unset_snan(srcd); } switch (extra & 0x7f) @@ -2720,18 +3119,18 @@ static bool arithmetic_softfloat(floatx80 *srcd, int reg, int extra) regs.fp[reg].fpx = fx; break; case 0x01: /* FINT */ - regs.fp[reg].fpx = floatx80_round_to_int(fx, fxstatus); + regs.fp[reg].fpx = floatx80_round_to_int(fx); break; case 0x03: /* FINTRZ */ - float_rounding_mode = fxstatus.float_rounding_mode; - fxstatus.float_rounding_mode = float_round_to_zero; - regs.fp[reg].fpx = floatx80_round_to_int(fx, fxstatus); - float_rounding_mode = fxstatus.float_rounding_mode; + float_rounding_mode_temp = float_rounding_mode; + float_rounding_mode = float_round_to_zero; + regs.fp[reg].fpx = floatx80_round_to_int(fx); + float_rounding_mode = float_rounding_mode_temp; break; case 0x04: /* FSQRT */ case 0x41: /* FSSQRT */ case 0x45: /* FDSQRT */ - regs.fp[reg].fpx = floatx80_sqrt(fx, fxstatus); + regs.fp[reg].fpx = floatx80_sqrt(fx); break; case 0x18: /* FABS */ case 0x58: /* FSABS */ @@ -2747,54 +3146,54 @@ static bool arithmetic_softfloat(floatx80 *srcd, int reg, int extra) case 0x20: /* FDIV */ case 0x60: /* FSDIV */ case 0x64: /* FDDIV */ - regs.fp[reg].fpx = floatx80_div(f, fx, fxstatus); + regs.fp[reg].fpx = floatx80_div(f, fx); break; case 0x22: /* FADD */ case 0x62: /* FSADD */ case 0x66: /* FDADD */ - regs.fp[reg].fpx = floatx80_add(f, fx, fxstatus); + regs.fp[reg].fpx = floatx80_add(f, fx); break; case 0x23: /* FMUL */ case 0x63: /* FSMUL */ case 0x67: /* FDMUL */ - regs.fp[reg].fpx = floatx80_mul(f, fx, fxstatus); + regs.fp[reg].fpx = floatx80_mul(f, fx); break; case 0x24: /* FSGLDIV */ - regs.fp[reg].fpx = floatx80_div(f, fx, fxstatus); + regs.fp[reg].fpx = floatx80_div(f, fx); sgl = true; break; case 0x25: /* FREM */ - floatx80_ieee754_remainder(f, fx, regs.fp[reg].fpx, q, fxstatus); + floatx80_rem(f, fx); break; case 0x27: /* FSGLMUL */ - regs.fp[reg].fpx = floatx80_mul(f, fx, fxstatus); + regs.fp[reg].fpx = floatx80_mul(f, fx); sgl = true; break; case 0x28: /* FSUB */ case 0x68: /* FSSUB */ case 0x6c: /* FDSUB */ - regs.fp[reg].fpx = floatx80_sub(f, fx, fxstatus); + regs.fp[reg].fpx = floatx80_sub(f, fx); break; case 0x38: /* FCMP */ - f = floatx80_sub(f, fx, fxstatus); + f = floatx80_sub(f, fx); regs.fpsr = 0; MAKE_FPSR_SOFTFLOAT(f); return true; case 0x3a: /* FTST */ regs.fpsr = 0; - MAKE_FPSR_SOFTFLOAT(f); + MAKE_FPSR_SOFTFLOAT(fx); return true; case 0x1d: /* FCOS */ - fcos(f, fxstatus); + floatx80_fcos(&f); regs.fp[reg].fpx = f; break; case 0x0e: /* FSIN */ - fsin(f, fxstatus); + floatx80_fsin(&f); regs.fp[reg].fpx = f; break; case 0x0f: /* FTAN */ - ftan(f, fxstatus); + floatx80_ftan(&f); regs.fp[reg].fpx = f; break; case 0x30: /* FSINCOS */ @@ -2805,11 +3204,31 @@ static bool arithmetic_softfloat(floatx80 *srcd, int reg, int extra) case 0x35: /* FSINCOS */ case 0x36: /* FSINCOS */ case 0x37: /* FSINCOS */ - fsincos(f, ®s.fp[extra & 7].fpx, ®s.fp[reg].fpx, fxstatus); + floatx80_fsincos(f, ®s.fp[extra & 7].fpx, ®s.fp[reg].fpx); + if (((regs.fpcr >> 6) & 3) == 1) + fp_round32(®s.fp[extra & 7]); + else if (((regs.fpcr >> 6) & 3) == 2) + fp_round64(®s.fp[extra & 7]); + break; + case 0x14: /* FLOGN */ + regs.fp[reg].fpx = floatx80_flogn(f); + break; + case 0x15: /* FLOG10 */ + regs.fp[reg].fpx = floatx80_flog10(f); + break; + case 0x16: /* FLOG2 */ + regs.fp[reg].fpx = floatx80_flog2(f); break; - - // some of following are supported by softfloat, later.. case 0x06: /* FLOGNP1 */ + regs.fp[reg].fpx = floatx80_flognp1(f); + break; + case 0x1e: /* FGETEXP */ + regs.fp[reg].fpx = floatx80_getexp(f); + break; + case 0x1f: /* FGETMAN */ + regs.fp[reg].fpx = floatx80_getman(f); + break; + case 0x08: /* FETOXM1 */ case 0x09: /* FTANH */ case 0x0a: /* FATAN */ @@ -2818,35 +3237,33 @@ static bool arithmetic_softfloat(floatx80 *srcd, int reg, int extra) case 0x10: /* FETOX */ case 0x11: /* FTWOTOX */ case 0x12: /* FTENTOX */ - case 0x14: /* FLOGN */ - case 0x15: /* FLOG10 */ - case 0x16: /* FLOG2 */ case 0x19: /* FCOSH */ case 0x1c: /* FACOS */ - case 0x1e: /* FGETEXP */ - case 0x1f: /* FGETMAN */ { // This is horribly ineffective.. - fptype fp; - uae_u32 out[3]; - // convert softfloat to raw words - softfloat_get(&fx, out); - // convert to double/long double - to_exten_x(&fp, out[0], out[1], out[2]); + fptype fpa; + fpdata fpdx = { 0 }; + fpdx.fpx = fx; + to_native(&fpa, &fpdx); // emulate instruction using normal fpu code - if (!arithmetic_fp(fp, reg, extra)) + if (!arithmetic_fp(fpa, reg, extra)) return false; - // convert back to raw - from_exten_x(regs.fp[reg].fp, &out[0], &out[1], &out[2]); - // convert to softfloat internal format - softfloat_set(®s.fp[reg].fpx, out); + from_native(fpa, ®s.fp[reg]); MAKE_FPSR_SOFTFLOAT(regs.fp[reg].fpx); } break; } + + if (sgl) { + fp_roundsgl(®s.fp[reg]); + } else if ((extra & 0x44) == 0x40 || ((regs.fpcr >> 6) & 3) == 1) { + fp_round32(®s.fp[reg]); + } else if ((extra & 0x44) == 0x44 || ((regs.fpcr >> 6) & 3) == 2) { + fp_round64(®s.fp[reg]); + } + return true; } -#endif static void fpuop_arithmetic2 (uae_u32 opcode, uae_u16 extra) { @@ -3102,11 +3519,9 @@ static void fpuop_arithmetic2 (uae_u32 opcode, uae_u16 extra) regs.fpiar = pc; CLEAR_STATUS (); -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) - v = arithmetic_softfloat(&srcd.fpx, reg, extra); + v = arithmetic_softfloat(&srcd, reg, extra); else -#endif v = arithmetic_fp(srcd.fp, reg, extra); if (!v) fpu_noinst (opcode, pc); @@ -3126,7 +3541,6 @@ void fpuop_arithmetic (uae_u32 opcode, uae_u16 extra) if (fpu_mmu_fixup) { mmufixup[0].reg = -1; } -#ifdef WITH_SOFTFLOAT if (currprefs.fpu_softfloat) { // Any exception status bit and matching exception enable bits set? if ((regs.fpcr >> 8) & (regs.fpsr >> 8)) { @@ -3144,7 +3558,6 @@ void fpuop_arithmetic (uae_u32 opcode, uae_u16 extra) write_log (_T("FPU exception: %08x %d!\n"), regs.fpsr, vector); } } -#endif } void fpu_reset (void) @@ -3159,7 +3572,6 @@ void fpu_reset (void) native_set_fpucw (regs.fpcr); fpux_restore (NULL); -#ifdef WITH_SOFTFLOAT fxsizes[0] = int32_to_floatx80(-128); fxsizes[1] = int32_to_floatx80(127); fxsizes[2] = int32_to_floatx80(-32768); @@ -3172,7 +3584,6 @@ void fpu_reset (void) fx_1e2 = int32_to_floatx80(100); fx_1e4 = int32_to_floatx80(10000); fx_1e8 = int32_to_floatx80(100000000); -#endif } uae_u8 *restore_fpu (uae_u8 *src) diff --git a/include/fpp-softfloat.h b/include/fpp-softfloat.h new file mode 100644 index 00000000..3bec3df9 --- /dev/null +++ b/include/fpp-softfloat.h @@ -0,0 +1,541 @@ +/* + * UAE - The Un*x Amiga Emulator + * + * MC68881 emulation + * + * Conversion routines for hosts knowing floating point format. + * + * Copyright 1996 Herman ten Brugge + * Modified 2005 Peter Keunecke + */ + +#ifndef FPP_H +#define FPP_H + +#define __USE_ISOC9X /* We might be able to pick up a NaN */ + +#include +#include + +#include + + +#define FPCR_ROUNDING_MODE 0x00000030 +#define FPCR_ROUND_NEAR 0x00000000 +#define FPCR_ROUND_ZERO 0x00000010 +#define FPCR_ROUND_MINF 0x00000020 +#define FPCR_ROUND_PINF 0x00000030 + +#define FPCR_ROUNDING_PRECISION 0x000000c0 +#define FPCR_PRECISION_SINGLE 0x00000040 +#define FPCR_PRECISION_DOUBLE 0x00000080 +#define FPCR_PRECISION_EXTENDED 0x00000000 + +extern uae_u32 fpp_get_fpsr (void); +extern void to_single(fptype *fp, uae_u32 wrd1); +extern void to_double(fptype *fp, uae_u32 wrd1, uae_u32 wrd2); +extern void to_exten(fptype *fp, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3); +extern void normalize_exten (uae_u32 *wrd1, uae_u32 *wrd2, uae_u32 *wrd3); + + +/* Functions for setting host/library modes and getting status */ +STATIC_INLINE void set_fp_mode(uae_u32 mode_control) +{ + switch(mode_control & FPCR_ROUNDING_PRECISION) { + case FPCR_PRECISION_SINGLE: // S + //floatx80_rounding_precision = 32; + break; + case FPCR_PRECISION_DOUBLE: // D + //floatx80_rounding_precision = 64; + break; + case FPCR_PRECISION_EXTENDED: // X + default: // undefined + //floatx80_rounding_precision = 80; + break; + } + floatx80_rounding_precision = 80; + + switch(mode_control & FPCR_ROUNDING_MODE) { + case FPCR_ROUND_NEAR: // to neareset + float_rounding_mode = float_round_nearest_even; + break; + case FPCR_ROUND_ZERO: // to zero + float_rounding_mode = float_round_to_zero; + break; + case FPCR_ROUND_MINF: // to minus + float_rounding_mode = float_round_down; + break; + case FPCR_ROUND_PINF: // to plus + float_rounding_mode = float_round_up; + break; + } + return; +} +STATIC_INLINE void get_fp_status(uae_u32 *status) +{ + if (float_exception_flags & float_flag_invalid) + *status |= 0x2000; + if (float_exception_flags & float_flag_divbyzero) + *status |= 0x0400; + if (float_exception_flags & float_flag_overflow) + *status |= 0x1000; + if (float_exception_flags & float_flag_underflow) + *status |= 0x0800; + if (float_exception_flags & float_flag_inexact) + *status |= 0x0200; +} +STATIC_INLINE void clear_fp_status(void) +{ + float_exception_flags = 0; +} + +/* Helper functions */ +STATIC_INLINE const char *fp_print(fptype *fx) +{ + static char fs[32]; + bool n, u, d; + long double result = 0.0; + int i; + + n = floatx80_is_negative(*fx); + u = floatx80_is_unnormal(*fx); + d = floatx80_is_denormal(*fx); + + if (floatx80_is_zero(*fx)) { + sprintf(fs, "%c%#.17Le%s%s", n?'-':'+', (long double) 0.0, u?"U":"", d?"D":""); + } else if (floatx80_is_infinity(*fx)) { + sprintf(fs, "%c%s", n?'-':'+', "inf"); + } else if (floatx80_is_signaling_nan(*fx)) { + sprintf(fs, "%c%s", n?'-':'+', "snan"); + } else if (floatx80_is_nan(*fx)) { + sprintf(fs, "%c%s", n?'-':'+', "nan"); + } else { + for (i = 63; i >= 0; i--) { + if (fx->low & (((uae_u64)1)<high&0x7FFF) - 0x3FFF); + sprintf(fs, "%c%#.17Le%s%s", n?'-':'+', result, u?"U":"", d?"D":""); + } + + return fs; +} + +STATIC_INLINE void softfloat_set(fptype *fx, uae_u32 *f) +{ + fx->high = (uae_u16)(f[0] >> 16); + fx->low = ((uae_u64)f[1] << 32) | f[2]; +} + +STATIC_INLINE void softfloat_get(fptype *fx, uae_u32 *f) +{ + f[0] = (uae_u32)(fx->high << 16); + f[1] = fx->low >> 32; + f[2] = (uae_u32)fx->low; +} + +/* Functions for detecting float type */ +STATIC_INLINE bool fp_is_snan(fptype *fp) +{ + return floatx80_is_signaling_nan(*fp) != 0; +} +STATIC_INLINE void fp_unset_snan(fptype *fp) +{ + fp->low |= LIT64(0x4000000000000000); +} +STATIC_INLINE bool fp_is_nan (fptype *fp) +{ + return floatx80_is_nan(*fp) != 0; +} +STATIC_INLINE bool fp_is_infinity (fptype *fp) +{ + return floatx80_is_infinity(*fp) != 0; +} +STATIC_INLINE bool fp_is_zero(fptype *fp) +{ + return floatx80_is_zero(*fp) != 0; +} +STATIC_INLINE bool fp_is_neg(fptype *fp) +{ + return floatx80_is_negative(*fp) != 0; +} +STATIC_INLINE bool fp_is_denormal(fptype *fp) +{ + return floatx80_is_denormal(*fp) != 0; +} +STATIC_INLINE bool fp_is_unnormal(fptype *fp) +{ + return floatx80_is_unnormal(*fp) != 0; +} + +/* Functions for converting between float formats */ +static const long double twoto32 = 4294967296.0; + +STATIC_INLINE void to_native(long double *fp, fptype fpx) +{ + int expon; + long double frac; + + expon = fpx.high & 0x7fff; + + if (floatx80_is_zero(fpx)) { + *fp = floatx80_is_negative(fpx) ? -0.0 : +0.0; + return; + } + if (floatx80_is_nan(fpx)) { + *fp = sqrtl(-1); + return; + } + if (floatx80_is_infinity(fpx)) { + *fp = floatx80_is_negative(fpx) ? logl(0.0) : (1.0/0.0); + return; + } + + frac = (long double)fpx.low / (long double)(twoto32 * 2147483648.0); + if (floatx80_is_negative(fpx)) + frac = -frac; + *fp = ldexpl (frac, expon - 16383); +} + +STATIC_INLINE void from_native(long double fp, fptype *fpx) +{ + int expon; + long double frac; + + if (signbit(fp)) + fpx->high = 0x8000; + else + fpx->high = 0x0000; + + if (isnan(fp)) { + fpx->high |= 0x7fff; + fpx->low = LIT64(0xffffffffffffffff); + return; + } + if (isinf(fp)) { + fpx->high |= 0x7fff; + fpx->low = LIT64(0x0000000000000000); + return; + } + if (fp == 0.0) { + fpx->low = LIT64(0x0000000000000000); + return; + } + if (fp < 0.0) + fp = -fp; + + frac = frexpl (fp, &expon); + frac += 0.5 / (twoto32 * twoto32); + if (frac >= 1.0) { + frac /= 2.0; + expon++; + } + fpx->high |= (expon + 16383 - 1) & 0x7fff; + fpx->low = (bits64)(frac * (long double)(twoto32 * twoto32)); + + while (!(fpx->low & LIT64( 0x8000000000000000))) { + if (fpx->high == 0) { + float_raise( float_flag_denormal ); + break; + } + fpx->low <<= 1; + fpx->high--; + } +} + +STATIC_INLINE void to_single_xn(fptype *fp, uae_u32 wrd1) +{ + float32 f = wrd1; + *fp = float32_to_floatx80(f); // automatically fix denormals +} +STATIC_INLINE void to_single_x(fptype *fp, uae_u32 wrd1) +{ + float32 f = wrd1; + *fp = float32_to_floatx80_allowunnormal(f); +} +STATIC_INLINE uae_u32 from_single_x(fptype *fp) +{ + float32 f = floatx80_to_float32(*fp); + return f; +} + +STATIC_INLINE void to_double_xn(fptype *fp, uae_u32 wrd1, uae_u32 wrd2) +{ + float64 f = ((float64)wrd1 << 32) | wrd2; + *fp = float64_to_floatx80(f); // automatically fix denormals +} +STATIC_INLINE void to_double_x(fptype *fp, uae_u32 wrd1, uae_u32 wrd2) +{ + float64 f = ((float64)wrd1 << 32) | wrd2; + *fp = float64_to_floatx80_allowunnormal(f); +} +STATIC_INLINE void from_double_x(fptype *fp, uae_u32 *wrd1, uae_u32 *wrd2) +{ + float64 f = floatx80_to_float64(*fp); + *wrd1 = f >> 32; + *wrd2 = (uae_u32)f; +} + +STATIC_INLINE void to_exten_x(fptype *fp, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3) +{ + uae_u32 wrd[3] = { wrd1, wrd2, wrd3 }; + softfloat_set(fp, wrd); +} +STATIC_INLINE void from_exten_x(fptype *fp, uae_u32 *wrd1, uae_u32 *wrd2, uae_u32 *wrd3) +{ + uae_u32 wrd[3]; + softfloat_get(fp, wrd); + *wrd1 = wrd[0]; + *wrd2 = wrd[1]; + *wrd3 = wrd[2]; +} + +STATIC_INLINE uae_s64 to_int(fptype *src, int size) +{ + static fptype fxsizes[6]; + static bool setup = false; + if (!setup) { + fxsizes[0] = int32_to_floatx80(-128); + fxsizes[1] = int32_to_floatx80(127); + fxsizes[2] = int32_to_floatx80(-32768); + fxsizes[3] = int32_to_floatx80(32767); + fxsizes[4] = int32_to_floatx80(-2147483648); + fxsizes[5] = int32_to_floatx80(2147483647); + setup = true; + } + + if (floatx80_lt(*src, fxsizes[size * 2 + 0])) { + return floatx80_to_int32(fxsizes[size * 2 + 0]); + } + if (floatx80_le(fxsizes[size * 2 + 1], *src)) { + return floatx80_to_int32(fxsizes[size * 2 + 1]); + } + return floatx80_to_int32(*src); +} +STATIC_INLINE fptype from_int(uae_s32 src) +{ + return int32_to_floatx80(src); +} + +/* Functions for rounding */ + +// round to float with extended precision exponent +STATIC_INLINE void fp_roundsgl(fptype *fp) +{ + *fp = floatx80_round32(*fp); +} + +// round to float +STATIC_INLINE void fp_round32(fptype *fp) +{ + float32 f = floatx80_to_float32(*fp); + *fp = float32_to_floatx80(f); +} + +// round to double +STATIC_INLINE void fp_round64(fptype *fp) +{ + float64 f = floatx80_to_float64(*fp); + *fp = float64_to_floatx80(f); +} + +/* Arithmetic functions */ + +STATIC_INLINE fptype fp_int(fptype a) +{ + return floatx80_round_to_int(a); +} +STATIC_INLINE fptype fp_intrz(fptype a) +{ + int8 save_rm; + fptype b; + save_rm = float_rounding_mode; + float_rounding_mode = float_round_to_zero; + b = floatx80_round_to_int(a); + float_rounding_mode = save_rm; + return b; +} +STATIC_INLINE fptype fp_sqrt(fptype a) +{ + return floatx80_sqrt(a); +} +STATIC_INLINE fptype fp_lognp1(fptype a) +{ + return floatx80_flognp1(a); +} +STATIC_INLINE fptype fp_sin(fptype a) +{ + floatx80_fsin(&a); + return a; +} +STATIC_INLINE fptype fp_tan(fptype a) +{ + floatx80_ftan(&a); + return a; +} +STATIC_INLINE fptype fp_logn(fptype a) +{ + return floatx80_flogn(a); +} +STATIC_INLINE fptype fp_log10(fptype a) +{ + return floatx80_flog10(a); +} +STATIC_INLINE fptype fp_log2(fptype a) +{ + return floatx80_flog2(a); +} +STATIC_INLINE fptype fp_abs(fptype a) +{ + return floatx80_abs(a); +} +STATIC_INLINE fptype fp_neg(fptype a) +{ + return floatx80_chs(a); +} +STATIC_INLINE fptype fp_cos(fptype a) +{ + floatx80_fcos(&a); + return a; +} +STATIC_INLINE fptype fp_getexp(fptype a) +{ + return floatx80_getexp(a); +} +STATIC_INLINE fptype fp_getman(fptype a) +{ + return floatx80_getman(a); +} +STATIC_INLINE fptype fp_div(fptype a, fptype b) +{ + return floatx80_div(a, b); +} +STATIC_INLINE fptype fp_add(fptype a, fptype b) +{ + return floatx80_add(a, b); +} +STATIC_INLINE fptype fp_mul(fptype a, fptype b) +{ + return floatx80_mul(a, b); +} +STATIC_INLINE fptype fp_rem(fptype a, fptype b) +{ + return floatx80_rem(a, b); +} +STATIC_INLINE fptype fp_scale(fptype a, fptype b) +{ + return floatx80_scale(a, b); +} +STATIC_INLINE fptype fp_sub(fptype a, fptype b) +{ + return floatx80_sub(a, b); +} + +/* FIXME: create softfloat functions for following arithmetics */ +#define fp_round_to_minus_infinity(x) floor(x) +#define fp_round_to_plus_infinity(x) ceil(x) +#define fp_round_to_zero(x) ((x) >= 0.0 ? floor(x) : ceil(x)) +#define fp_round_to_nearest(x) ((x) >= 0.0 ? (int)((x) + 0.5) : (int)((x) - 0.5)) + +STATIC_INLINE fptype fp_sinh(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = sinhl(fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_etoxm1(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = expl(fpa) - 1.0; + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_tanh(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = tanhl(fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_atan(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = atanl(fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_asin(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = asinl(fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_atanh(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = atanhl(fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_etox(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = expl(fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_twotox(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = powl(2.0, fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_tentox(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = powl(10.0, fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_cosh(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = coshl(fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_acos(fptype a) +{ + long double fpa; + to_native(&fpa, a); + fpa = acosl(fpa); + from_native(fpa, &a); + return a; +} +STATIC_INLINE fptype fp_mod(fptype a, fptype b) +{ + long double fpa, fpb; + long double quot; + to_native(&fpa, a); + to_native(&fpb, a); + quot = fp_round_to_zero(fpa / fpb); + fpa -= quot * fpb; + from_native(fpa, &a); + return a; +} + +#endif diff --git a/od-win32/md-fpp.h b/od-win32/md-fpp.h index 45058c0e..64041233 100644 --- a/od-win32/md-fpp.h +++ b/od-win32/md-fpp.h @@ -25,7 +25,9 @@ extern void to_single(fpdata *fpd, uae_u32 value); extern void to_double(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2); extern void to_exten(fpdata *fpd, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3); +extern const TCHAR *fp_print(fpdata *fpd); +#if 0 STATIC_INLINE void exten_zeronormalize(uae_u32 *pwrd1, uae_u32 *pwrd2, uae_u32 *pwrd3) { uae_u32 wrd1 = *pwrd1; @@ -198,7 +200,6 @@ STATIC_INLINE void from_double_x(double src, uae_u32 * wrd1, uae_u32 * wrd2) } #endif -static const double twoto32 = 4294967296.0; #ifndef HAVE_to_exten #define HAVE_to_exten STATIC_INLINE void to_exten_x(fptype *fp, uae_u32 wrd1, uae_u32 wrd2, uae_u32 wrd3) @@ -248,3 +249,4 @@ STATIC_INLINE void from_exten_x(fptype fp, uae_u32 * wrd1, uae_u32 * wrd2, uae_u *wrd3 = (uae_u32) ((frac * twoto32 - *wrd2) * twoto32); } #endif +#endif diff --git a/od-win32/sysconfig.h b/od-win32/sysconfig.h index f887120b..baba2e57 100644 --- a/od-win32/sysconfig.h +++ b/od-win32/sysconfig.h @@ -52,7 +52,6 @@ #define UAESERIAL /* uaeserial.device emulation */ #define FPUEMU /* FPU emulation */ #define FPU_UAE -#define WITH_SOFTFLOAT #define MMUEMU /* Aranym 68040 MMU */ #define FULLMMU /* Aranym 68040 MMU */ #define CPUEMU_0 /* generic 680x0 emulation */ diff --git a/softfloat/README.txt b/softfloat/README.txt new file mode 100644 index 00000000..9500d25e --- /dev/null +++ b/softfloat/README.txt @@ -0,0 +1,78 @@ +MAME note: this package is derived from the following original SoftFloat +package and has been "re-packaged" to work with MAME's conventions and +build system. The source files come from bits64/ and bits64/templates +in the original distribution as MAME requires a compiler with a 64-bit +integer type. + + +Package Overview for SoftFloat Release 2b + +John R. Hauser +2002 May 27 + + +---------------------------------------------------------------------------- +Overview + +SoftFloat is a software implementation of floating-point that conforms to +the IEC/IEEE Standard for Binary Floating-Point Arithmetic. SoftFloat is +distributed in the form of C source code. Compiling the SoftFloat sources +generates two things: + +-- A SoftFloat object file (typically `softfloat.o') containing the complete + set of IEC/IEEE floating-point routines. + +-- A `timesoftfloat' program for evaluating the speed of the SoftFloat + routines. (The SoftFloat module is linked into this program.) + +The SoftFloat package is documented in four text files: + + SoftFloat.txt Documentation for using the SoftFloat functions. + SoftFloat-source.txt Documentation for compiling SoftFloat. + SoftFloat-history.txt History of major changes to SoftFloat. + timesoftfloat.txt Documentation for using `timesoftfloat'. + +Other files in the package comprise the source code for SoftFloat. + +Please be aware that some work is involved in porting this software to other +targets. It is not just a matter of getting `make' to complete without +error messages. I would have written the code that way if I could, but +there are fundamental differences between systems that can't be hidden. +You should not attempt to compile SoftFloat without first reading both +`SoftFloat.txt' and `SoftFloat-source.txt'. + + +---------------------------------------------------------------------------- +Legal Notice + +SoftFloat was written by me, John R. Hauser. This work was made possible in +part by the International Computer Science Institute, located at Suite 600, +1947 Center Street, Berkeley, California 94704. Funding was partially +provided by the National Science Foundation under grant MIP-9311980. The +original version of this code was written as part of a project to build +a fixed-point vector processor in collaboration with the University of +California at Berkeley, overseen by Profs. Nelson Morgan and John Wawrzynek. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort +has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT +TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO +PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL +LOSSES, COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO +FURTHERMORE EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER +SCIENCE INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, +COSTS, OR OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE +SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, provided +that the minimal documentation requirements stated in the source code are +satisfied. + + +---------------------------------------------------------------------------- +Contact Information + +At the time of this writing, the most up-to-date information about +SoftFloat and the latest release can be found at the Web page `http:// +www.cs.berkeley.edu/~jhauser/arithmetic/SoftFloat.html'. + + diff --git a/softfloat/SOFTFLOAT-MACROS.H b/softfloat/SOFTFLOAT-MACROS.H new file mode 100644 index 00000000..4607d346 --- /dev/null +++ b/softfloat/SOFTFLOAT-MACROS.H @@ -0,0 +1,743 @@ + +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 32, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'. +*----------------------------------------------------------------------------*/ + +INLINE void shift32RightJamming( bits32 a, int16 count, bits32 *zPtr ) +{ + bits32 z; + + if ( count == 0 ) { + z = a; + } + else if ( count < 32 ) { + z = ( a>>count ) | ( ( a<<( ( - count ) & 31 ) ) != 0 ); + } + else { + z = ( a != 0 ); + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Shifts `a' right by the number of bits given in `count'. If any nonzero +| bits are shifted off, they are ``jammed'' into the least significant bit of +| the result by setting the least significant bit to 1. The value of `count' +| can be arbitrarily large; in particular, if `count' is greater than 64, the +| result will be either 0 or 1, depending on whether `a' is zero or nonzero. +| The result is stored in the location pointed to by `zPtr'. +*----------------------------------------------------------------------------*/ + +INLINE void shift64RightJamming( bits64 a, int16 count, bits64 *zPtr ) +{ + bits64 z; + + if ( count == 0 ) { + z = a; + } + else if ( count < 64 ) { + z = ( a>>count ) | ( ( a<<( ( - count ) & 63 ) ) != 0 ); + } + else { + z = ( a != 0 ); + } + *zPtr = z; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by 64 +| _plus_ the number of bits given in `count'. The shifted result is at most +| 64 nonzero bits; this is stored at the location pointed to by `z0Ptr'. The +| bits shifted off form a second 64-bit result as follows: The _last_ bit +| shifted off is the most-significant bit of the extra result, and the other +| 63 bits of the extra result are all zero if and only if _all_but_the_last_ +| bits shifted off were all zero. This extra result is stored in the location +| pointed to by `z1Ptr'. The value of `count' can be arbitrarily large. +| (This routine makes more sense if `a0' and `a1' are considered to form +| a fixed-point value with binary point between `a0' and `a1'. This fixed- +| point value is shifted right by the number of bits given in `count', and +| the integer part of the result is returned at the location pointed to by +| `z0Ptr'. The fractional part of the result may be slightly corrupted as +| described above, and is returned at the location pointed to by `z1Ptr'.) +*----------------------------------------------------------------------------*/ + +INLINE void + shift64ExtraRightJamming( + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits64 z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<>count; + } + else { + if ( count == 64 ) { + z1 = a0 | ( a1 != 0 ); + } + else { + z1 = ( ( a0 | a1 ) != 0 ); + } + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' can be arbitrarily large; in particular, if `count' is greater +| than 128, the result will be 0. The result is broken into two 64-bit pieces +| which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shift128Right( + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits64 z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<>count ); + z0 = a0>>count; + } + else { + z1 = ( count < 128 ) ? ( a0>>( count & 63 ) ) : 0; + z0 = 0; + } + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' right by the +| number of bits given in `count'. If any nonzero bits are shifted off, they +| are ``jammed'' into the least significant bit of the result by setting the +| least significant bit to 1. The value of `count' can be arbitrarily large; +| in particular, if `count' is greater than 128, the result will be either +| 0 or 1, depending on whether the concatenation of `a0' and `a1' is zero or +| nonzero. The result is broken into two 64-bit pieces which are stored at +| the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shift128RightJamming( + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits64 z0, z1; + int8 negCount = ( - count ) & 63; + + if ( count == 0 ) { + z1 = a1; + z0 = a0; + } + else if ( count < 64 ) { + z1 = ( a0<>count ) | ( ( a1<>count; + } + else { + if ( count == 64 ) { + z1 = a0 | ( a1 != 0 ); + } + else if ( count < 128 ) { + z1 = ( a0>>( count & 63 ) ) | ( ( ( a0<>count ); + z0 = a0>>count; + } + else { + if ( count == 64 ) { + z2 = a1; + z1 = a0; + } + else { + a2 |= a1; + if ( count < 128 ) { + z2 = a0<>( count & 63 ); + } + else { + z2 = ( count == 128 ) ? a0 : ( a0 != 0 ); + z1 = 0; + } + } + z0 = 0; + } + z2 |= ( a2 != 0 ); + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Shifts the 128-bit value formed by concatenating `a0' and `a1' left by the +| number of bits given in `count'. Any bits shifted off are lost. The value +| of `count' must be less than 64. The result is broken into two 64-bit +| pieces which are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shortShift128Left( + bits64 a0, bits64 a1, int16 count, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + + *z1Ptr = a1<>( ( - count ) & 63 ) ); + +} + +/*---------------------------------------------------------------------------- +| Shifts the 192-bit value formed by concatenating `a0', `a1', and `a2' left +| by the number of bits given in `count'. Any bits shifted off are lost. +| The value of `count' must be less than 64. The result is broken into three +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + shortShift192Left( + bits64 a0, + bits64 a1, + bits64 a2, + int16 count, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2; + int8 negCount; + + z2 = a2<>negCount; + z0 |= a1>>negCount; + } + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Adds the 128-bit value formed by concatenating `a0' and `a1' to the 128-bit +| value formed by concatenating `b0' and `b1'. Addition is modulo 2^128, so +| any carry out is lost. The result is broken into two 64-bit pieces which +| are stored at the locations pointed to by `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + add128( + bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits64 z1; + + z1 = a1 + b1; + *z1Ptr = z1; + *z0Ptr = a0 + b0 + ( z1 < a1 ); + +} + +/*---------------------------------------------------------------------------- +| Adds the 192-bit value formed by concatenating `a0', `a1', and `a2' to the +| 192-bit value formed by concatenating `b0', `b1', and `b2'. Addition is +| modulo 2^192, so any carry out is lost. The result is broken into three +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr', +| `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + add192( + bits64 a0, + bits64 a1, + bits64 a2, + bits64 b0, + bits64 b1, + bits64 b2, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2; + uint8 carry0, carry1; + + z2 = a2 + b2; + carry1 = ( z2 < a2 ); + z1 = a1 + b1; + carry0 = ( z1 < a1 ); + z0 = a0 + b0; + z1 += carry1; + z0 += ( z1 < carry1 ); + z0 += carry0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 128-bit value formed by concatenating `b0' and `b1' from the +| 128-bit value formed by concatenating `a0' and `a1'. Subtraction is modulo +| 2^128, so any borrow out (carry out) is lost. The result is broken into two +| 64-bit pieces which are stored at the locations pointed to by `z0Ptr' and +| `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + sub128( + bits64 a0, bits64 a1, bits64 b0, bits64 b1, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + + *z1Ptr = a1 - b1; + *z0Ptr = a0 - b0 - ( a1 < b1 ); + +} + +/*---------------------------------------------------------------------------- +| Subtracts the 192-bit value formed by concatenating `b0', `b1', and `b2' +| from the 192-bit value formed by concatenating `a0', `a1', and `a2'. +| Subtraction is modulo 2^192, so any borrow out (carry out) is lost. The +| result is broken into three 64-bit pieces which are stored at the locations +| pointed to by `z0Ptr', `z1Ptr', and `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + sub192( + bits64 a0, + bits64 a1, + bits64 a2, + bits64 b0, + bits64 b1, + bits64 b2, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2; + uint8 borrow0, borrow1; + + z2 = a2 - b2; + borrow1 = ( a2 < b2 ); + z1 = a1 - b1; + borrow0 = ( a1 < b1 ); + z0 = a0 - b0; + z0 -= ( z1 < borrow1 ); + z1 -= borrow1; + z0 -= borrow0; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies `a' by `b' to obtain a 128-bit product. The product is broken +| into two 64-bit pieces which are stored at the locations pointed to by +| `z0Ptr' and `z1Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void mul64To128( bits64 a, bits64 b, bits64 *z0Ptr, bits64 *z1Ptr ) +{ + bits32 aHigh, aLow, bHigh, bLow; + bits64 z0, zMiddleA, zMiddleB, z1; + + aLow = (bits32)a; + aHigh = a>>32; + bLow = (bits32)b; + bHigh = b>>32; + z1 = ( (bits64) aLow ) * bLow; + zMiddleA = ( (bits64) aLow ) * bHigh; + zMiddleB = ( (bits64) aHigh ) * bLow; + z0 = ( (bits64) aHigh ) * bHigh; + zMiddleA += zMiddleB; + z0 += ( ( (bits64) ( zMiddleA < zMiddleB ) )<<32 ) + ( zMiddleA>>32 ); + zMiddleA <<= 32; + z1 += zMiddleA; + z0 += ( z1 < zMiddleA ); + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' by +| `b' to obtain a 192-bit product. The product is broken into three 64-bit +| pieces which are stored at the locations pointed to by `z0Ptr', `z1Ptr', and +| `z2Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + mul128By64To192( + bits64 a0, + bits64 a1, + bits64 b, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr + ) +{ + bits64 z0, z1, z2, more1; + + mul64To128( a1, b, &z1, &z2 ); + mul64To128( a0, b, &z0, &more1 ); + add128( z0, more1, 0, z1, &z0, &z1 ); + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Multiplies the 128-bit value formed by concatenating `a0' and `a1' to the +| 128-bit value formed by concatenating `b0' and `b1' to obtain a 256-bit +| product. The product is broken into four 64-bit pieces which are stored at +| the locations pointed to by `z0Ptr', `z1Ptr', `z2Ptr', and `z3Ptr'. +*----------------------------------------------------------------------------*/ + +INLINE void + mul128To256( + bits64 a0, + bits64 a1, + bits64 b0, + bits64 b1, + bits64 *z0Ptr, + bits64 *z1Ptr, + bits64 *z2Ptr, + bits64 *z3Ptr + ) +{ + bits64 z0, z1, z2, z3; + bits64 more1, more2; + + mul64To128( a1, b1, &z2, &z3 ); + mul64To128( a1, b0, &z1, &more2 ); + add128( z1, more2, 0, z2, &z1, &z2 ); + mul64To128( a0, b0, &z0, &more1 ); + add128( z0, more1, 0, z1, &z0, &z1 ); + mul64To128( a0, b1, &more1, &more2 ); + add128( more1, more2, 0, z2, &more1, &z2 ); + add128( z0, z1, 0, more1, &z0, &z1 ); + *z3Ptr = z3; + *z2Ptr = z2; + *z1Ptr = z1; + *z0Ptr = z0; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the 64-bit integer quotient obtained by dividing +| `b' into the 128-bit value formed by concatenating `a0' and `a1'. The +| divisor `b' must be at least 2^63. If q is the exact quotient truncated +| toward zero, the approximation returned lies between q and q + 2 inclusive. +| If the exact quotient q is larger than 64 bits, the maximum positive 64-bit +| unsigned integer is returned. +*----------------------------------------------------------------------------*/ + +INLINE bits64 estimateDiv128To64( bits64 a0, bits64 a1, bits64 b ) +{ + bits64 b0, b1; + bits64 rem0, rem1, term0, term1; + bits64 z; + + if ( b <= a0 ) return LIT64( 0xFFFFFFFFFFFFFFFF ); + b0 = b>>32; + z = ( b0<<32 <= a0 ) ? LIT64( 0xFFFFFFFF00000000 ) : ( a0 / b0 )<<32; + mul64To128( b, z, &term0, &term1 ); + sub128( a0, a1, term0, term1, &rem0, &rem1 ); + while ( ( (sbits64) rem0 ) < 0 ) { + z -= LIT64( 0x100000000 ); + b1 = b<<32; + add128( rem0, rem1, b0, b1, &rem0, &rem1 ); + } + rem0 = ( rem0<<32 ) | ( rem1>>32 ); + z |= ( b0<<32 <= rem0 ) ? 0xFFFFFFFF : rem0 / b0; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns an approximation to the square root of the 32-bit significand given +| by `a'. Considered as an integer, `a' must be at least 2^31. If bit 0 of +| `aExp' (the least significant bit) is 1, the integer returned approximates +| 2^31*sqrt(`a'/2^31), where `a' is considered an integer. If bit 0 of `aExp' +| is 0, the integer returned approximates 2^31*sqrt(`a'/2^30). In either +| case, the approximation returned lies strictly within +/-2 of the exact +| value. +*----------------------------------------------------------------------------*/ + +INLINE bits32 estimateSqrt32( int16 aExp, bits32 a ) +{ + static const bits16 sqrtOddAdjustments[] = { + 0x0004, 0x0022, 0x005D, 0x00B1, 0x011D, 0x019F, 0x0236, 0x02E0, + 0x039C, 0x0468, 0x0545, 0x0631, 0x072B, 0x0832, 0x0946, 0x0A67 + }; + static const bits16 sqrtEvenAdjustments[] = { + 0x0A2D, 0x08AF, 0x075A, 0x0629, 0x051A, 0x0429, 0x0356, 0x029E, + 0x0200, 0x0179, 0x0109, 0x00AF, 0x0068, 0x0034, 0x0012, 0x0002 + }; + int8 index; + bits32 z; + + index = ( a>>27 ) & 15; + if ( aExp & 1 ) { + z = 0x4000 + ( a>>17 ) - sqrtOddAdjustments[ index ]; + z = ( ( a / z )<<14 ) + ( z<<15 ); + a >>= 1; + } + else { + z = 0x8000 + ( a>>17 ) - sqrtEvenAdjustments[ index ]; + z = a / z + z; + z = ( 0x20000 <= z ) ? 0xFFFF8000 : ( z<<15 ); + if ( z <= a ) return (bits32) ( ( (sbits32) a )>>1 ); + } + return ( (bits32) ( ( ( (bits64) a )<<31 ) / z ) ) + ( z>>1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 32 is returned. +*----------------------------------------------------------------------------*/ + +static int8 countLeadingZeros32( bits32 a ) +{ + static const int8 countLeadingZerosHigh[] = { + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int8 shiftCount; + + shiftCount = 0; + if ( a < 0x10000 ) { + shiftCount += 16; + a <<= 16; + } + if ( a < 0x1000000 ) { + shiftCount += 8; + a <<= 8; + } + shiftCount += countLeadingZerosHigh[ a>>24 ]; + return shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Returns the number of leading 0 bits before the most-significant 1 bit of +| `a'. If `a' is zero, 64 is returned. +*----------------------------------------------------------------------------*/ + +static int8 countLeadingZeros64( bits64 a ) +{ + int8 shiftCount; + + shiftCount = 0; + if ( a < ( (bits64) 1 )<<32 ) { + shiftCount += 32; + } + else { + a >>= 32; + } + shiftCount += countLeadingZeros32((bits32)a ); + return shiftCount; + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' +| is equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag eq128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) +{ + + return ( a0 == b0 ) && ( a1 == b1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less +| than or equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag le128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) +{ + + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 <= b1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is less +| than the 128-bit value formed by concatenating `b0' and `b1'. Otherwise, +| returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag lt128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) +{ + + return ( a0 < b0 ) || ( ( a0 == b0 ) && ( a1 < b1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the 128-bit value formed by concatenating `a0' and `a1' is +| not equal to the 128-bit value formed by concatenating `b0' and `b1'. +| Otherwise, returns 0. +*----------------------------------------------------------------------------*/ + +INLINE flag ne128( bits64 a0, bits64 a1, bits64 b0, bits64 b1 ) +{ + + return ( a0 != b0 ) || ( a1 != b1 ); + +} + +/*----------------------------------------------------------------------------- +| Changes the sign of the extended double-precision floating-point value 'a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +INLINE floatx80 floatx80_chs(floatx80 reg) +{ + reg.high ^= 0x8000; + return reg; +} + +/*----------------------------------------------------------------------------- +| Calculates the absolute value of the extended double-precision floating-point +| value `a'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +INLINE floatx80 floatx80_abs(floatx80 reg) +{ + reg.high &= 0x7FFF; + return reg; +} diff --git a/softfloat/fpu_constant.h b/softfloat/fpu_constant.h new file mode 100644 index 00000000..fdd9719e --- /dev/null +++ b/softfloat/fpu_constant.h @@ -0,0 +1,80 @@ +/*============================================================================ +This source file is an extension to the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator) +floating point emulation. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +#ifndef _FPU_CONSTANTS_H_ +#define _FPU_CONSTANTS_H_ + +// Pentium CPU uses only 68-bit precision M_PI approximation +#define BETTER_THAN_PENTIUM + +/*============================================================================ + * Written for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +////////////////////////////// +// PI, PI/2, PI/4 constants +////////////////////////////// + +#define FLOATX80_PI_EXP (0x4000) + +// 128-bit PI fraction +#ifdef BETTER_THAN_PENTIUM +#define FLOAT_PI_HI (0xc90fdaa22168c234U) +#define FLOAT_PI_LO (0xc4c6628b80dc1cd1U) +#else +#define FLOAT_PI_HI (0xc90fdaa22168c234U) +#define FLOAT_PI_LO (0xC000000000000000U) +#endif + +#define FLOATX80_PI2_EXP (0x3FFF) +#define FLOATX80_PI4_EXP (0x3FFE) + +////////////////////////////// +// 3PI/4 constant +////////////////////////////// + +#define FLOATX80_3PI4_EXP (0x4000) + +// 128-bit 3PI/4 fraction +#ifdef BETTER_THAN_PENTIUM +#define FLOAT_3PI4_HI (0x96cbe3f9990e91a7U) +#define FLOAT_3PI4_LO (0x9394c9e8a0a5159cU) +#else +#define FLOAT_3PI4_HI (0x96cbe3f9990e91a7U) +#define FLOAT_3PI4_LO (0x9000000000000000U) +#endif + +////////////////////////////// +// 1/LN2 constant +////////////////////////////// + +#define FLOAT_LN2INV_EXP (0x3FFF) + +// 128-bit 1/LN2 fraction +#ifdef BETTER_THAN_PENTIUM +#define FLOAT_LN2INV_HI (0xb8aa3b295c17f0bbU) +#define FLOAT_LN2INV_LO (0xbe87fed0691d3e89U) +#else +#define FLOAT_LN2INV_HI (0xb8aa3b295c17f0bbU) +#define FLOAT_LN2INV_LO (0xC000000000000000U) +#endif + +#endif diff --git a/softfloat/fsincos.cpp b/softfloat/fsincos.cpp new file mode 100644 index 00000000..695936ed --- /dev/null +++ b/softfloat/fsincos.cpp @@ -0,0 +1,652 @@ +/*============================================================================ +This source file is an extension to the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator) +floating point emulation. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Written for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * ==========================================================================*/ + +#define FLOATX128 + +#define USE_estimateDiv128To64 +#include "mamesf.h" +#include "softfloat.h" +//#include "softfloat-specialize" +#include "fpu_constant.h" + +static const floatx80 floatx80_one = { 0x3fff, 0x8000000000000000U }; +static const floatx80 floatx80_default_nan = { 0xffff, 0xffffffffffffffffU }; + +#define packFloat2x128m(zHi, zLo) {(zHi), (zLo)} +#define PACK_FLOAT_128(hi,lo) packFloat2x128m(LIT64(hi),LIT64(lo)) + +#define EXP_BIAS 0x3FFF + +#if 0 /* Included in softfloat-specialize" */ +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits64 extractFloatx80Frac( floatx80 a ) +{ + return a.low; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int32 extractFloatx80Exp( floatx80 a ) +{ + return a.high & 0x7FFF; + +} +#endif + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the extended double-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloatx80Sign( floatx80 a ) +{ + return a.high>>15; + +} + +/*---------------------------------------------------------------------------- +| Takes extended double-precision floating-point NaN `a' and returns the +| appropriate NaN result. If `a' is a signaling NaN, the invalid exception +| is raised. +*----------------------------------------------------------------------------*/ + +INLINE floatx80 propagateFloatx80NaNOneArg(floatx80 a) +{ + if (floatx80_is_signaling_nan(a)) + float_raise(float_flag_invalid); + + a.low |= 0xC000000000000000U; + + return a; +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal extended double-precision floating-point value +| represented by the denormalized significand `aSig'. The normalized exponent +| and significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr) +{ + int shiftCount = countLeadingZeros64(aSig); + *zSigPtr = aSig< 0) { + q = argument_reduction_kernel(*aSig0, expDiff, aSig0, aSig1); + } + else { + if (FLOAT_PI_HI <= *aSig0) { + *aSig0 -= FLOAT_PI_HI; + q = 1; + } + } + + shift128Right(FLOAT_PI_HI, FLOAT_PI_LO, 1, &term0, &term1); + if (! lt128(*aSig0, *aSig1, term0, term1)) + { + int lt = lt128(term0, term1, *aSig0, *aSig1); + int eq = eq128(*aSig0, *aSig1, term0, term1); + + if ((eq && (q & 1)) || lt) { + *zSign = !(*zSign); + ++q; + } + if (lt) sub128(FLOAT_PI_HI, FLOAT_PI_LO, *aSig0, *aSig1, aSig0, aSig1); + } + + return (int)(q & 3); +} + +#define SIN_ARR_SIZE 11 +#define COS_ARR_SIZE 11 + +static float128 sin_arr[SIN_ARR_SIZE] = +{ + PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 1 */ + PACK_FLOAT_128(0xbffc555555555555, 0x5555555555555555), /* 3 */ + PACK_FLOAT_128(0x3ff8111111111111, 0x1111111111111111), /* 5 */ + PACK_FLOAT_128(0xbff2a01a01a01a01, 0xa01a01a01a01a01a), /* 7 */ + PACK_FLOAT_128(0x3fec71de3a556c73, 0x38faac1c88e50017), /* 9 */ + PACK_FLOAT_128(0xbfe5ae64567f544e, 0x38fe747e4b837dc7), /* 11 */ + PACK_FLOAT_128(0x3fde6124613a86d0, 0x97ca38331d23af68), /* 13 */ + PACK_FLOAT_128(0xbfd6ae7f3e733b81, 0xf11d8656b0ee8cb0), /* 15 */ + PACK_FLOAT_128(0x3fce952c77030ad4, 0xa6b2605197771b00), /* 17 */ + PACK_FLOAT_128(0xbfc62f49b4681415, 0x724ca1ec3b7b9675), /* 19 */ + PACK_FLOAT_128(0x3fbd71b8ef6dcf57, 0x18bef146fcee6e45) /* 21 */ +}; + +static float128 cos_arr[COS_ARR_SIZE] = +{ + PACK_FLOAT_128(0x3fff000000000000, 0x0000000000000000), /* 0 */ + PACK_FLOAT_128(0xbffe000000000000, 0x0000000000000000), /* 2 */ + PACK_FLOAT_128(0x3ffa555555555555, 0x5555555555555555), /* 4 */ + PACK_FLOAT_128(0xbff56c16c16c16c1, 0x6c16c16c16c16c17), /* 6 */ + PACK_FLOAT_128(0x3fefa01a01a01a01, 0xa01a01a01a01a01a), /* 8 */ + PACK_FLOAT_128(0xbfe927e4fb7789f5, 0xc72ef016d3ea6679), /* 10 */ + PACK_FLOAT_128(0x3fe21eed8eff8d89, 0x7b544da987acfe85), /* 12 */ + PACK_FLOAT_128(0xbfda93974a8c07c9, 0xd20badf145dfa3e5), /* 14 */ + PACK_FLOAT_128(0x3fd2ae7f3e733b81, 0xf11d8656b0ee8cb0), /* 16 */ + PACK_FLOAT_128(0xbfca6827863b97d9, 0x77bb004886a2c2ab), /* 18 */ + PACK_FLOAT_128(0x3fc1e542ba402022, 0x507a9cad2bf8f0bb) /* 20 */ +}; + +extern float128 OddPoly (float128 x, float128 *arr, unsigned n); + +/* 0 <= x <= pi/4 */ +INLINE float128 poly_sin(float128 x) +{ + // 3 5 7 9 11 13 15 + // x x x x x x x + // sin (x) ~ x - --- + --- - --- + --- - ---- + ---- - ---- = + // 3! 5! 7! 9! 11! 13! 15! + // + // 2 4 6 8 10 12 14 + // x x x x x x x + // = x * [ 1 - --- + --- - --- + --- - ---- + ---- - ---- ] = + // 3! 5! 7! 9! 11! 13! 15! + // + // 3 3 + // -- 4k -- 4k+2 + // p(x) = > C * x > 0 q(x) = > C * x < 0 + // -- 2k -- 2k+1 + // k=0 k=0 + // + // 2 + // sin(x) ~ x * [ p(x) + x * q(x) ] + // + + return OddPoly(x, sin_arr, SIN_ARR_SIZE); +} + +extern float128 EvenPoly(float128 x, float128 *arr, unsigned n); + +/* 0 <= x <= pi/4 */ +INLINE float128 poly_cos(float128 x) +{ + // 2 4 6 8 10 12 14 + // x x x x x x x + // cos (x) ~ 1 - --- + --- - --- + --- - ---- + ---- - ---- + // 2! 4! 6! 8! 10! 12! 14! + // + // 3 3 + // -- 4k -- 4k+2 + // p(x) = > C * x > 0 q(x) = > C * x < 0 + // -- 2k -- 2k+1 + // k=0 k=0 + // + // 2 + // cos(x) ~ [ p(x) + x * q(x) ] + // + + return EvenPoly(x, cos_arr, COS_ARR_SIZE); +} + +INLINE void sincos_invalid(floatx80 *sin_a, floatx80 *cos_a, floatx80 a) +{ + if (sin_a) *sin_a = a; + if (cos_a) *cos_a = a; +} + +INLINE void sincos_tiny_argument(floatx80 *sin_a, floatx80 *cos_a, floatx80 a) +{ + if (sin_a) *sin_a = a; + if (cos_a) *cos_a = floatx80_one; +} + +static floatx80 sincos_approximation(int neg, float128 r, uint64_t quotient) +{ + if (quotient & 0x1) { + r = poly_cos(r); + neg = 0; + } else { + r = poly_sin(r); + } + + floatx80 result = float128_to_floatx80(r); + if (quotient & 0x2) + neg = ! neg; + + if (neg) + result = floatx80_chs(result); + + return result; +} + +// ================================================= +// SFFSINCOS Compute sin(x) and cos(x) +// ================================================= + +// +// Uses the following identities: +// ---------------------------------------------------------- +// +// sin(-x) = -sin(x) +// cos(-x) = cos(x) +// +// sin(x+y) = sin(x)*cos(y)+cos(x)*sin(y) +// cos(x+y) = sin(x)*sin(y)+cos(x)*cos(y) +// +// sin(x+ pi/2) = cos(x) +// sin(x+ pi) = -sin(x) +// sin(x+3pi/2) = -cos(x) +// sin(x+2pi) = sin(x) +// + +int sf_fsincos(floatx80 a, floatx80 *sin_a, floatx80 *cos_a) +{ + uint64_t aSig0, aSig1 = 0; + int32_t aExp, zExp, expDiff; + int aSign, zSign; + int q = 0; + + aSig0 = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + aSign = extractFloatx80Sign(a); + + /* invalid argument */ + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig0<<1)) { + sincos_invalid(sin_a, cos_a, propagateFloatx80NaNOneArg(a)); + return 0; + } + + float_raise(float_flag_invalid); + sincos_invalid(sin_a, cos_a, floatx80_default_nan); + return 0; + } + + if (aExp == 0) { + if (aSig0 == 0) { + sincos_tiny_argument(sin_a, cos_a, a); + return 0; + } + +// float_raise(float_flag_denormal); + + /* handle pseudo denormals */ + if (! (aSig0 & 0x8000000000000000U)) + { + float_raise(float_flag_inexact); + if (sin_a) + float_raise(float_flag_underflow); + sincos_tiny_argument(sin_a, cos_a, a); + return 0; + } + + normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0); + } + + zSign = aSign; + zExp = EXP_BIAS; + expDiff = aExp - zExp; + + /* argument is out-of-range */ + if (expDiff >= 63) + return -1; + + float_raise(float_flag_inexact); + + if (expDiff < -1) { // doesn't require reduction + if (expDiff <= -68) { + a = packFloatx80(aSign, aExp, aSig0); + sincos_tiny_argument(sin_a, cos_a, a); + return 0; + } + zExp = aExp; + } + else { + q = reduce_trig_arg(expDiff, &zSign, &aSig0, &aSig1); + } + + /* **************************** */ + /* argument reduction completed */ + /* **************************** */ + + /* using float128 for approximation */ + float128 r = normalizeRoundAndPackFloat128(0, zExp-0x10, aSig0, aSig1); + + if (aSign) q = -q; + if (cos_a) *cos_a = sincos_approximation(zSign, r, q+1); + if (sin_a) *sin_a = sincos_approximation(zSign, r, q); + + return 0; +} + +int floatx80_fsincos(floatx80 a, floatx80 *sin_a, floatx80 *cos_a) +{ + return sf_fsincos(a, sin_a, cos_a); +} + +int floatx80_fsin(floatx80 *a) +{ + return sf_fsincos(*a, a, 0); +} + +int floatx80_fcos(floatx80 *a) +{ + return sf_fsincos(*a, 0, a); +} + +// ================================================= +// FPTAN Compute tan(x) +// ================================================= + +// +// Uses the following identities: +// +// 1. ---------------------------------------------------------- +// +// sin(-x) = -sin(x) +// cos(-x) = cos(x) +// +// sin(x+y) = sin(x)*cos(y)+cos(x)*sin(y) +// cos(x+y) = sin(x)*sin(y)+cos(x)*cos(y) +// +// sin(x+ pi/2) = cos(x) +// sin(x+ pi) = -sin(x) +// sin(x+3pi/2) = -cos(x) +// sin(x+2pi) = sin(x) +// +// 2. ---------------------------------------------------------- +// +// sin(x) +// tan(x) = ------ +// cos(x) +// + +int floatx80_ftan(floatx80 *a) +{ + uint64_t aSig0, aSig1 = 0; + int32_t aExp, zExp, expDiff; + int aSign, zSign; + int q = 0; + + aSig0 = extractFloatx80Frac(*a); + aExp = extractFloatx80Exp(*a); + aSign = extractFloatx80Sign(*a); + + /* invalid argument */ + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig0<<1)) + { + *a = propagateFloatx80NaNOneArg(*a); + return 0; + } + + float_raise(float_flag_invalid); + *a = floatx80_default_nan; + return 0; + } + + if (aExp == 0) { + if (aSig0 == 0) return 0; +// float_raise(float_flag_denormal); + /* handle pseudo denormals */ + if (! (aSig0 & 0x8000000000000000U)) + { + float_raise(float_flag_inexact | float_flag_underflow); + return 0; + } + normalizeFloatx80Subnormal(aSig0, &aExp, &aSig0); + } + + zSign = aSign; + zExp = EXP_BIAS; + expDiff = aExp - zExp; + + /* argument is out-of-range */ + if (expDiff >= 63) + return -1; + + float_raise(float_flag_inexact); + + if (expDiff < -1) { // doesn't require reduction + if (expDiff <= -68) { + *a = packFloatx80(aSign, aExp, aSig0); + return 0; + } + zExp = aExp; + } + else { + q = reduce_trig_arg(expDiff, &zSign, &aSig0, &aSig1); + } + + /* **************************** */ + /* argument reduction completed */ + /* **************************** */ + + /* using float128 for approximation */ + float128 r = normalizeRoundAndPackFloat128(0, zExp-0x10, aSig0, aSig1); + + float128 sin_r = poly_sin(r); + float128 cos_r = poly_cos(r); + + if (q & 0x1) { + r = float128_div(cos_r, sin_r); + zSign = ! zSign; + } else { + r = float128_div(sin_r, cos_r); + } + + *a = float128_to_floatx80(r); + if (zSign) + *a = floatx80_chs(*a); + + return 0; +} + +// 2 3 4 n +// f(x) ~ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x) +// 0 1 2 3 4 n +// +// -- 2k -- 2k+1 +// p(x) = > C * x q(x) = > C * x +// -- 2k -- 2k+1 +// +// f(x) ~ [ p(x) + x * q(x) ] +// + +float128 EvalPoly(float128 x, float128 *arr, unsigned n) +{ + float128 x2 = float128_mul(x, x); + unsigned i; + + assert(n > 1); + + float128 r1 = arr[--n]; + i = n; + while(i >= 2) { + r1 = float128_mul(r1, x2); + i -= 2; + r1 = float128_add(r1, arr[i]); + } + if (i) r1 = float128_mul(r1, x); + + float128 r2 = arr[--n]; + i = n; + while(i >= 2) { + r2 = float128_mul(r2, x2); + i -= 2; + r2 = float128_add(r2, arr[i]); + } + if (i) r2 = float128_mul(r2, x); + + return float128_add(r1, r2); +} + +// 2 4 6 8 2n +// f(x) ~ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x) +// 0 1 2 3 4 n +// +// -- 4k -- 4k+2 +// p(x) = > C * x q(x) = > C * x +// -- 2k -- 2k+1 +// +// 2 +// f(x) ~ [ p(x) + x * q(x) ] +// + +float128 EvenPoly(float128 x, float128 *arr, unsigned n) +{ + return EvalPoly(float128_mul(x, x), arr, n); +} + +// 3 5 7 9 2n+1 +// f(x) ~ (C * x) + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x) +// 0 1 2 3 4 n +// 2 4 6 8 2n +// = x * [ C + (C * x) + (C * x) + (C * x) + (C * x) + ... + (C * x) +// 0 1 2 3 4 n +// +// -- 4k -- 4k+2 +// p(x) = > C * x q(x) = > C * x +// -- 2k -- 2k+1 +// +// 2 +// f(x) ~ x * [ p(x) + x * q(x) ] +// + +float128 OddPoly(float128 x, float128 *arr, unsigned n) +{ + return float128_mul(x, EvenPoly(x, arr, n)); +} + +/*---------------------------------------------------------------------------- +| Scales extended double-precision floating-point value in operand `a' by +| value `b'. The function truncates the value in the second operand 'b' to +| an integral value and adds that value to the exponent of the operand 'a'. +| The operation performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +extern floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b ); + +floatx80 floatx80_scale(floatx80 a, floatx80 b) +{ + sbits32 aExp, bExp; + bits64 aSig, bSig; + + // handle unsupported extended double-precision floating encodings +/* if (floatx80_is_unsupported(a) || floatx80_is_unsupported(b)) + { + float_raise(float_flag_invalid); + return floatx80_default_nan; + }*/ + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + int aSign = extractFloatx80Sign(a); + bSig = extractFloatx80Frac(b); + bExp = extractFloatx80Exp(b); + int bSign = extractFloatx80Sign(b); + + if (aExp == 0x7FFF) { + if ((bits64) (aSig<<1) || ((bExp == 0x7FFF) && (bits64) (bSig<<1))) + { + return propagateFloatx80NaN(a, b); + } + if ((bExp == 0x7FFF) && bSign) { + float_raise(float_flag_invalid); + return floatx80_default_nan; + } + if (bSig && (bExp == 0)) float_raise(float_flag_denormal); + return a; + } + if (bExp == 0x7FFF) { + if ((bits64) (bSig<<1)) return propagateFloatx80NaN(a, b); + if ((aExp | aSig) == 0) { + if (! bSign) { + float_raise(float_flag_invalid); + return floatx80_default_nan; + } + return a; + } + if (aSig && (aExp == 0)) float_raise(float_flag_denormal); + if (bSign) return packFloatx80(aSign, 0, 0); + return packFloatx80(aSign, 0x7FFF, 0x8000000000000000U); + } + if (aExp == 0) { + if (aSig == 0) return a; + float_raise(float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + if (bSig == 0) return a; + float_raise(float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + + if (bExp > 0x400E) { + /* generate appropriate overflow/underflow */ + return roundAndPackFloatx80(80, aSign, + bSign ? -0x3FFF : 0x7FFF, aSig, 0); + } + if (bExp < 0x3FFF) return a; + + int shiftCount = 0x403E - bExp; + bSig >>= shiftCount; + sbits32 scale = bSig; + if (bSign) scale = -scale; /* -32768..32767 */ + return + roundAndPackFloatx80(80, aSign, aExp+scale, aSig, 0); +} diff --git a/softfloat/fyl2x.cpp b/softfloat/fyl2x.cpp new file mode 100644 index 00000000..28dddb17 --- /dev/null +++ b/softfloat/fyl2x.cpp @@ -0,0 +1,494 @@ +/*============================================================================ +This source file is an extension to the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b, written for Bochs (x86 achitecture simulator) +floating point emulation. +float_raise(float_flag_invalid) +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. +=============================================================================*/ + +/*============================================================================ + * Written for Bochs (x86 achitecture simulator) by + * Stanislav Shwartsman [sshwarts at sourceforge net] + * Adapted for lib/softfloat in MESS by Hans Ostermeyer (03/2012) + * ==========================================================================*/ + +#define FLOATX128 + +#define USE_estimateDiv128To64 +#include "mamesf.h" +#include "softfloat.h" +//#include "softfloat-specialize" +#include "fpu_constant.h" +/* +static const floatx80 floatx80_log10_2 = packFloatx80(0, 0x3ffd, 0x9a209a84fbcff798U); +static const floatx80 floatx80_ln_2 = packFloatx80(0, 0x3ffe, 0xb17217f7d1cf79acU); +static const floatx80 floatx80_one = packFloatx80(0, 0x3fff, 0x8000000000000000U); +static const floatx80 floatx80_default_nan = packFloatx80(0, 0xffff, 0xffffffffffffffffU); +*/ +static const floatx80 floatx80_log10_2 = { 0x3ffd, 0x9a209a84fbcff798U }; +static const floatx80 floatx80_ln_2 = { 0x3ffe, 0xb17217f7d1cf79acU }; +static const floatx80 floatx80_one = { 0x3fff, 0x8000000000000000U }; +static const floatx80 floatx80_default_nan = { 0xffff, 0xffffffffffffffffU }; + +#define packFloat_128(zHi, zLo) {(zHi), (zLo)} +#define PACK_FLOAT_128(hi,lo) packFloat_128(LIT64(hi),LIT64(lo)) + +#define EXP_BIAS 0x3FFF + +#if 0 /* Included in softfloat-specialize */ +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits64 extractFloatx80Frac( floatx80 a ) +{ + return a.low; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int32 extractFloatx80Exp( floatx80 a ) +{ + return a.high & 0x7FFF; + +} +#endif + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the extended double-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloatx80Sign( floatx80 a ) +{ + return a.high>>15; + +} + +#if 0 +/*---------------------------------------------------------------------------- +| Takes extended double-precision floating-point NaN `a' and returns the +| appropriate NaN result. If `a' is a signaling NaN, the invalid exception +| is raised. +*----------------------------------------------------------------------------*/ + +INLINE floatx80 propagateFloatx80NaNOneArg(floatx80 a) +{ + if (floatx80_is_signaling_nan(a)) + float_raise(float_flag_invalid); + + a.low |= 0xC000000000000000U; + + return a; +} +#endif + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal extended double-precision floating-point value +| represented by the denormalized significand `aSig'. The normalized exponent +| and significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +INLINE void normalizeFloatx80Subnormal(uint64_t aSig, int32_t *zExpPtr, uint64_t *zSigPtr) +{ + int shiftCount = countLeadingZeros64(aSig); + *zSigPtr = aSig< C * u q(u) = > C * u + // -- 2k -- 2k+1 + // k=0 k=0 + // + // 1+u 2 + // 1/2 ln --- ~ u * [ p(u) + u * q(u) ] + // 1-u + // +*/ + return OddPoly(x1, ln_arr, L2_ARR_SIZE); +} + +/* required sqrt(2)/2 < x < sqrt(2) */ +static float128 poly_l2(float128 x) +{ + /* using float128 for approximation */ + float128 x_p1 = float128_add(x, float128_one); + float128 x_m1 = float128_sub(x, float128_one); + x = float128_div(x_m1, x_p1); + x = poly_ln(x); + x = float128_mul(x, float128_ln2inv2); + return x; +} + +static float128 poly_l2p1(float128 x) +{ + /* using float128 for approximation */ + float128 x_p2 = float128_add(x, float128_two); + x = float128_div(x, x_p2); + x = poly_ln(x); + x = float128_mul(x, float128_ln2inv2); + return x; +} + +// ================================================= +// FYL2X Compute y * log (x) +// 2 +// ================================================= + +// +// Uses the following identities: +// +// 1. ---------------------------------------------------------- +// ln(x) +// log (x) = -------, ln (x*y) = ln(x) + ln(y) +// 2 ln(2) +// +// 2. ---------------------------------------------------------- +// 1+u x-1 +// ln (x) = ln -----, when u = ----- +// 1-u x+1 +// +// 3. ---------------------------------------------------------- +// 3 5 7 2n+1 +// 1+u u u u u +// ln ----- = 2 [ u + --- + --- + --- + ... + ------ + ... ] +// 1-u 3 5 7 2n+1 +// + +static floatx80 fyl2x(floatx80 a, floatx80 b) +{ + uint64_t aSig = extractFloatx80Frac(a); + int32_t aExp = extractFloatx80Exp(a); + int aSign = extractFloatx80Sign(a); + uint64_t bSig = extractFloatx80Frac(b); + int32_t bExp = extractFloatx80Exp(b); + int bSign = extractFloatx80Sign(b); + + int zSign = bSign ^ 1; + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1) + || ((bExp == 0x7FFF) && (uint64_t) (bSig<<1))) + { + return propagateFloatx80NaN(a, b); + } + if (aSign) + { +invalid: + float_raise(float_flag_invalid); + return floatx80_default_nan; + } + else { + if (bExp == 0) { + if (bSig == 0) goto invalid; + float_raise(float_flag_denormal); + } + return packFloatx80(bSign, 0x7FFF, 0x8000000000000000U); + } + } + if (bExp == 0x7FFF) + { + if ((uint64_t) (bSig<<1)) return propagateFloatx80NaN(a, b); + if (aSign && (uint64_t)(aExp | aSig)) goto invalid; + if (aSig && (aExp == 0)) + float_raise(float_flag_denormal); + if (aExp < 0x3FFF) { + return packFloatx80(zSign, 0x7FFF, 0x8000000000000000U); + } + if (aExp == 0x3FFF && ((uint64_t) (aSig<<1) == 0)) goto invalid; + return packFloatx80(bSign, 0x7FFF, 0x8000000000000000U); + } + if (aExp == 0) { + if (aSig == 0) { + if ((bExp | bSig) == 0) goto invalid; + float_raise(float_flag_divbyzero); + return packFloatx80(zSign, 0x7FFF, 0x8000000000000000U); + } + if (aSign) goto invalid; + float_raise(float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + if (aSign) goto invalid; + if (bExp == 0) { + if (bSig == 0) { + if (aExp < 0x3FFF) return packFloatx80(zSign, 0, 0); + return packFloatx80(bSign, 0, 0); + } + float_raise(float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + if (aExp == 0x3FFF && ((uint64_t) (aSig<<1) == 0)) + return packFloatx80(bSign, 0, 0); + + float_raise(float_flag_inexact); + + int ExpDiff = aExp - 0x3FFF; + aExp = 0; + if (aSig >= SQRT2_HALF_SIG) { + ExpDiff++; + aExp--; + } + + /* ******************************** */ + /* using float128 for approximation */ + /* ******************************** */ + + uint64_t zSig0, zSig1; + shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1); + float128 x = packFloat128(0, aExp+0x3FFF, zSig0, zSig1); + x = poly_l2(x); + x = float128_add(x, int64_to_float128((int64_t) ExpDiff)); + return floatx80_mul(b, float128_to_floatx80(x)); +} + +// ================================================= +// FYL2XP1 Compute y * log (x + 1) +// 2 +// ================================================= + +// +// Uses the following identities: +// +// 1. ---------------------------------------------------------- +// ln(x) +// log (x) = ------- +// 2 ln(2) +// +// 2. ---------------------------------------------------------- +// 1+u x +// ln (x+1) = ln -----, when u = ----- +// 1-u x+2 +// +// 3. ---------------------------------------------------------- +// 3 5 7 2n+1 +// 1+u u u u u +// ln ----- = 2 [ u + --- + --- + --- + ... + ------ + ... ] +// 1-u 3 5 7 2n+1 +// + +floatx80 fyl2xp1(floatx80 a, floatx80 b) +{ + int32_t aExp, bExp; + uint64_t aSig, bSig, zSig0, zSig1, zSig2; + int aSign, bSign; + + aSig = extractFloatx80Frac(a); + aExp = extractFloatx80Exp(a); + aSign = extractFloatx80Sign(a); + bSig = extractFloatx80Frac(b); + bExp = extractFloatx80Exp(b); + bSign = extractFloatx80Sign(b); + int zSign = aSign ^ bSign; + + if (aExp == 0x7FFF) { + if ((uint64_t) (aSig<<1) + || ((bExp == 0x7FFF) && (uint64_t) (bSig<<1))) + { + return propagateFloatx80NaN(a, b); + } + if (aSign) + { +invalid: + float_raise(float_flag_invalid); + return floatx80_default_nan; + } + else { + if (bExp == 0) { + if (bSig == 0) goto invalid; + float_raise(float_flag_denormal); + } + return packFloatx80(bSign, 0x7FFF, 0x8000000000000000U); + } + } + if (bExp == 0x7FFF) + { + if ((uint64_t) (bSig<<1)) + return propagateFloatx80NaN(a, b); + + if (aExp == 0) { + if (aSig == 0) goto invalid; + float_raise(float_flag_denormal); + } + + return packFloatx80(zSign, 0x7FFF, 0x8000000000000000U); + } + if (aExp == 0) { + if (aSig == 0) { + if (bSig && (bExp == 0)) float_raise(float_flag_denormal); + return packFloatx80(zSign, 0, 0); + } + float_raise(float_flag_denormal); + normalizeFloatx80Subnormal(aSig, &aExp, &aSig); + } + if (bExp == 0) { + if (bSig == 0) return packFloatx80(zSign, 0, 0); + float_raise(float_flag_denormal); + normalizeFloatx80Subnormal(bSig, &bExp, &bSig); + } + + float_raise(float_flag_inexact); + + if (aSign && aExp >= 0x3FFF) + return a; + + if (aExp >= 0x3FFC) // big argument + { + return fyl2x(floatx80_add(a, floatx80_one), b); + } + + // handle tiny argument + if (aExp < EXP_BIAS-70) + { + // first order approximation, return (a*b)/ln(2) + int32_t zExp = aExp + FLOAT_LN2INV_EXP - 0x3FFE; + + mul128By64To192(FLOAT_LN2INV_HI, FLOAT_LN2INV_LO, aSig, &zSig0, &zSig1, &zSig2); + if (0 < (int64_t) zSig0) { + shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1); + --zExp; + } + + zExp = zExp + bExp - 0x3FFE; + mul128By64To192(zSig0, zSig1, bSig, &zSig0, &zSig1, &zSig2); + if (0 < (int64_t) zSig0) { + shortShift128Left(zSig0, zSig1, 1, &zSig0, &zSig1); + --zExp; + } + + return + roundAndPackFloatx80(80, aSign ^ bSign, zExp, zSig0, zSig1); + } + + /* ******************************** */ + /* using float128 for approximation */ + /* ******************************** */ + + shift128Right(aSig<<1, 0, 16, &zSig0, &zSig1); + float128 x = packFloat128(aSign, aExp, zSig0, zSig1); + x = poly_l2p1(x); + return floatx80_mul(b, float128_to_floatx80(x)); +} + +floatx80 floatx80_flognp1(floatx80 a) +{ + return fyl2xp1(a, floatx80_ln_2); +} + +floatx80 floatx80_flogn(floatx80 a) +{ + return fyl2x(a, floatx80_ln_2); +} + +floatx80 floatx80_flog2(floatx80 a) +{ + return fyl2x(a, floatx80_one); +} + +floatx80 floatx80_flog10(floatx80 a) +{ + return fyl2x(a, floatx80_log10_2); +} diff --git a/softfloat/mamesf.h b/softfloat/mamesf.h new file mode 100644 index 00000000..9e55a228 --- /dev/null +++ b/softfloat/mamesf.h @@ -0,0 +1,72 @@ +#ifndef MAMESF_H +#define MAMESF_H + +/*---------------------------------------------------------------------------- +| One of the macros `BIGENDIAN' or `LITTLEENDIAN' must be defined. +*----------------------------------------------------------------------------*/ +#ifdef LSB_FIRST +#define LITTLEENDIAN +#else +#define BIGENDIAN +#endif + +/*---------------------------------------------------------------------------- +| The macro `BITS64' can be defined to indicate that 64-bit integer types are +| supported by the compiler. +*----------------------------------------------------------------------------*/ +#define BITS64 + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines the most convenient type that holds +| integers of at least as many bits as specified. For example, `uint8' should +| be the most convenient type that can hold unsigned integers of as many as +| 8 bits. The `flag' type must be able to hold either a 0 or 1. For most +| implementations of C, `flag', `uint8', and `int8' should all be `typedef'ed +| to the same as `int'. +*----------------------------------------------------------------------------*/ +//#include "assert.h" +#include "sysdeps.h" + +typedef int8_t flag; +typedef uint8_t uint8; +typedef int8_t int8; +typedef uint16_t uint16; +typedef int16_t int16; +typedef uint32_t uint32; +typedef int32_t int32; +typedef uint64_t uint64; +typedef int64_t int64; + +/*---------------------------------------------------------------------------- +| Each of the following `typedef's defines a type that holds integers +| of _exactly_ the number of bits specified. For instance, for most +| implementation of C, `bits16' and `sbits16' should be `typedef'ed to +| `unsigned short int' and `signed short int' (or `short int'), respectively. +*----------------------------------------------------------------------------*/ +typedef uint8_t bits8; +typedef int8_t sbits8; +typedef uint16_t bits16; +typedef int16_t sbits16; +typedef uint32_t bits32; +typedef int32_t sbits32; +typedef uint64_t bits64; +typedef int64_t sbits64; + +/*---------------------------------------------------------------------------- +| The `LIT64' macro takes as its argument a textual integer literal and +| if necessary ``marks'' the literal as having a 64-bit integer type. +| For example, the GNU C Compiler (`gcc') requires that 64-bit literals be +| appended with the letters `LL' standing for `long long', which is `gcc's +| name for the 64-bit integer type. Some compilers may allow `LIT64' to be +| defined as the identity macro: `#define LIT64( a ) a'. +*----------------------------------------------------------------------------*/ +#define LIT64( a ) a##ULL + +/*---------------------------------------------------------------------------- +| The macro `INLINE' can be used before functions that should be inlined. If +| a compiler does not support explicit inlining, this macro should be defined +| to be `static'. +*----------------------------------------------------------------------------*/ +#define INLINE static inline + +#endif //MAMESF_H \ No newline at end of file diff --git a/softfloat/milieu.h b/softfloat/milieu.h new file mode 100644 index 00000000..10687b75 --- /dev/null +++ b/softfloat/milieu.h @@ -0,0 +1,42 @@ + +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Include common integer types and flags. +*----------------------------------------------------------------------------*/ +#include "mamesf.h" + +/*---------------------------------------------------------------------------- +| Symbolic Boolean literals. +*----------------------------------------------------------------------------*/ +#define FALSE 0 +#define TRUE 1 diff --git a/softfloat/softfloat-specialize.h b/softfloat/softfloat-specialize.h new file mode 100644 index 00000000..f0ec9ef2 --- /dev/null +++ b/softfloat/softfloat-specialize.h @@ -0,0 +1,554 @@ + +/*============================================================================ + +This C source fragment is part of the SoftFloat IEC/IEEE Floating-point +Arithmetic Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +/*---------------------------------------------------------------------------- +| Underflow tininess-detection mode, statically initialized to default value. +| (The declaration in `softfloat.h' must match the `int8' type here.) +*----------------------------------------------------------------------------*/ +int8 float_detect_tininess = float_tininess_after_rounding; + +/*---------------------------------------------------------------------------- +| Raises the exceptions specified by `flags'. Floating-point traps can be +| defined here if desired. It is currently not possible for such a trap to +| substitute a result value. If traps are not implemented, this routine +| should be simply `float_exception_flags |= flags;'. +*----------------------------------------------------------------------------*/ + +void float_raise( int8 flags ) +{ + + float_exception_flags |= flags; + +} + +/*---------------------------------------------------------------------------- +| Internal canonical NaN format. +*----------------------------------------------------------------------------*/ +typedef struct { + flag sign; + bits64 high, low; +} commonNaNT; + +/*---------------------------------------------------------------------------- +| The pattern for a default generated single-precision NaN. +*----------------------------------------------------------------------------*/ +#define float32_default_nan 0xFFFFFFFF + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag float32_is_nan( float32 a ) +{ + + return ( 0xFF000000 < (bits32) ( a<<1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag float32_is_signaling_nan( float32 a ) +{ + + return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float32ToCommonNaN( float32 a ) +{ + commonNaNT z; + + if ( float32_is_signaling_nan( a ) ) float_raise( float_flag_invalid ); + z.sign = a>>31; + z.low = 0; + z.high = ( (bits64) a )<<41; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the single- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float32 commonNaNToFloat32( commonNaNT a ) +{ + + return ( ( (bits32) a.sign )<<31 ) | 0x7FC00000 | ( a.high>>41 ); + +} + +/*---------------------------------------------------------------------------- +| Takes two single-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float32 propagateFloat32NaN( float32 a, float32 b ) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + aIsNaN = float32_is_nan( a ); + aIsSignalingNaN = float32_is_signaling_nan( a ); + bIsNaN = float32_is_nan( b ); + bIsSignalingNaN = float32_is_signaling_nan( b ); + a |= 0x00400000; + b |= 0x00400000; + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid ); + if ( aIsNaN ) { + return ( aIsSignalingNaN & bIsNaN ) ? b : a; + } + else { + return b; + } + +} + +/*---------------------------------------------------------------------------- +| The pattern for a default generated double-precision NaN. +*----------------------------------------------------------------------------*/ +#define float64_default_nan LIT64( 0xFFFFFFFFFFFFFFFF ) + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag float64_is_nan( float64 a ) +{ + + return ( LIT64( 0xFFE0000000000000 ) < (bits64) ( a<<1 ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is a signaling +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag float64_is_signaling_nan( float64 a ) +{ + + return + ( ( ( a>>51 ) & 0xFFF ) == 0xFFE ) + && ( a & LIT64( 0x0007FFFFFFFFFFFF ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float64ToCommonNaN( float64 a ) +{ + commonNaNT z; + + if ( float64_is_signaling_nan( a ) ) float_raise( float_flag_invalid ); + z.sign = a>>63; + z.low = 0; + z.high = a<<12; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the double- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float64 commonNaNToFloat64( commonNaNT a ) +{ + + return + ( ( (bits64) a.sign )<<63 ) + | LIT64( 0x7FF8000000000000 ) + | ( a.high>>12 ); + +} + +/*---------------------------------------------------------------------------- +| Takes two double-precision floating-point values `a' and `b', one of which +| is a NaN, and returns the appropriate NaN result. If either `a' or `b' is a +| signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float64 propagateFloat64NaN( float64 a, float64 b ) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + aIsNaN = float64_is_nan( a ); + aIsSignalingNaN = float64_is_signaling_nan( a ); + bIsNaN = float64_is_nan( b ); + bIsSignalingNaN = float64_is_signaling_nan( b ); + a |= LIT64( 0x0008000000000000 ); + b |= LIT64( 0x0008000000000000 ); + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid ); + if ( aIsNaN ) { + return ( aIsSignalingNaN & bIsNaN ) ? b : a; + } + else { + return b; + } + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| The pattern for a default generated extended double-precision NaN. The +| `high' and `low' values hold the most- and least-significant bits, +| respectively. +*----------------------------------------------------------------------------*/ +#define floatx80_default_nan_high 0xFFFF +#define floatx80_default_nan_low LIT64( 0xFFFFFFFFFFFFFFFF ) + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is a +| NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag floatx80_is_nan( floatx80 a ) +{ + + return ( ( a.high & 0x7FFF ) == 0x7FFF ) && (bits64) ( a.low<<1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is a +| signaling NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag floatx80_is_signaling_nan( floatx80 a ) +{ + bits64 aLow; + + aLow = a.low & ~ LIT64( 0x4000000000000000 ); + return + ( ( a.high & 0x7FFF ) == 0x7FFF ) + && (bits64) ( aLow<<1 ) + && ( a.low == aLow ); + +} + +// 28-12-2016: Added for Previous: + +/*---------------------------------------------------------------------------- + | Returns 1 if the extended double-precision floating-point value `a' is + | zero; otherwise returns 0. + *----------------------------------------------------------------------------*/ + +flag floatx80_is_zero( floatx80 a ) +{ + + return ( ( a.high & 0x7FFF ) == 0 ) && ( a.low == 0 ); + +} + +/*---------------------------------------------------------------------------- + | Returns 1 if the extended double-precision floating-point value `a' is + | infinity; otherwise returns 0. + *----------------------------------------------------------------------------*/ + +flag floatx80_is_infinity( floatx80 a ) +{ + + return ( ( a.high & 0x7FFF ) == 0x7FFF ) && ( (bits64) ( a.low<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- + | Returns 1 if the extended double-precision floating-point value `a' is + | negative; otherwise returns 0. + *----------------------------------------------------------------------------*/ + +flag floatx80_is_negative( floatx80 a ) +{ + + return ( ( a.high & 0x8000 ) == 0x8000 ); + +} + +/*---------------------------------------------------------------------------- + | Returns 1 if the extended double-precision floating-point value `a' is + | denormal; otherwise returns 0. + *----------------------------------------------------------------------------*/ + +flag floatx80_is_denormal( floatx80 a ) +{ + + return + ( ( a.high & 0x7FFF ) == 0 ) + && ( (bits64) ( a.low & LIT64( 0x8000000000000000 ) ) == LIT64( 0x0000000000000000 ) ) + && (bits64) ( a.low<<1 ); + +} + +/*---------------------------------------------------------------------------- + | Returns 1 if the extended double-precision floating-point value `a' is + | unnormal; otherwise returns 0. + *----------------------------------------------------------------------------*/ + +flag floatx80_is_unnormal( floatx80 a ) +{ + + return + ( ( a.high & 0x7FFF ) > 0 ) + && ( ( a.high & 0x7FFF ) < 0x7FFF) + && ( (bits64) ( a.low & LIT64( 0x8000000000000000 ) ) == LIT64( 0x0000000000000000 ) ); + +} + +/*---------------------------------------------------------------------------- + | Returns 1 if the extended double-precision floating-point value `a' is + | normal; otherwise returns 0. + *----------------------------------------------------------------------------*/ + +flag floatx80_is_normal( floatx80 a ) +{ + + return + ( ( a.high & 0x7FFF ) < 0x7FFF ) + && ( (bits64) ( a.low & LIT64( 0x8000000000000000 ) ) == LIT64( 0x8000000000000000 ) ); + +} + +// End of addition for Previous + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point NaN `a' to the canonical NaN format. If `a' is a signaling NaN, the +| invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT floatx80ToCommonNaN( floatx80 a ) +{ + commonNaNT z; + + if ( floatx80_is_signaling_nan( a ) ) float_raise( float_flag_invalid ); + z.sign = a.high>>15; + z.low = 0; + z.high = a.low<<1; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the extended +| double-precision floating-point format. +*----------------------------------------------------------------------------*/ + +static floatx80 commonNaNToFloatx80( commonNaNT a ) +{ + floatx80 z; + + z.low = LIT64( 0xC000000000000000 ) | ( a.high>>1 ); + z.high = ( ( (bits16) a.sign )<<15 ) | 0x7FFF; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes two extended double-precision floating-point values `a' and `b', one +| of which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +floatx80 propagateFloatx80NaN( floatx80 a, floatx80 b ) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + aIsNaN = floatx80_is_nan( a ); + aIsSignalingNaN = floatx80_is_signaling_nan( a ); + bIsNaN = floatx80_is_nan( b ); + bIsSignalingNaN = floatx80_is_signaling_nan( b ); + a.low |= LIT64( 0xC000000000000000 ); + b.low |= LIT64( 0xC000000000000000 ); + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid ); + if ( aIsNaN ) { + return ( aIsSignalingNaN & bIsNaN ) ? b : a; + } + else { + return b; + } + +} + +#define EXP_BIAS 0x3FFF + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +bits64 extractFloatx80Frac( floatx80 a ) +{ + + return a.low; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the extended double-precision floating-point +| value `a'. +*----------------------------------------------------------------------------*/ + +int32 extractFloatx80Exp( floatx80 a ) +{ + + return a.high & 0x7FFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the extended double-precision floating-point value +| `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloatx80Sign( floatx80 a ) +{ + + return a.high>>15; + +} + +#endif + +#ifdef FLOATX128 + +/*---------------------------------------------------------------------------- +| The pattern for a default generated quadruple-precision NaN. The `high' and +| `low' values hold the most- and least-significant bits, respectively. +*----------------------------------------------------------------------------*/ +#define float128_default_nan_high LIT64( 0xFFFFFFFFFFFFFFFF ) +#define float128_default_nan_low LIT64( 0xFFFFFFFFFFFFFFFF ) + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is a NaN; +| otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag float128_is_nan( float128 a ) +{ + + return + ( LIT64( 0xFFFE000000000000 ) <= (bits64) ( a.high<<1 ) ) + && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is a +| signaling NaN; otherwise returns 0. +*----------------------------------------------------------------------------*/ + +flag float128_is_signaling_nan( float128 a ) +{ + + return + ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE ) + && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point NaN +| `a' to the canonical NaN format. If `a' is a signaling NaN, the invalid +| exception is raised. +*----------------------------------------------------------------------------*/ + +static commonNaNT float128ToCommonNaN( float128 a ) +{ + commonNaNT z; + + if ( float128_is_signaling_nan( a ) ) float_raise( float_flag_invalid ); + z.sign = a.high>>63; + shortShift128Left( a.high, a.low, 16, &z.high, &z.low ); + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the canonical NaN `a' to the quadruple- +| precision floating-point format. +*----------------------------------------------------------------------------*/ + +static float128 commonNaNToFloat128( commonNaNT a ) +{ + float128 z; + + shift128Right( a.high, a.low, 16, &z.high, &z.low ); + z.high |= ( ( (bits64) a.sign )<<63 ) | LIT64( 0x7FFF800000000000 ); + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes two quadruple-precision floating-point values `a' and `b', one of +| which is a NaN, and returns the appropriate NaN result. If either `a' or +| `b' is a signaling NaN, the invalid exception is raised. +*----------------------------------------------------------------------------*/ + +static float128 propagateFloat128NaN( float128 a, float128 b ) +{ + flag aIsNaN, aIsSignalingNaN, bIsNaN, bIsSignalingNaN; + + aIsNaN = float128_is_nan( a ); + aIsSignalingNaN = float128_is_signaling_nan( a ); + bIsNaN = float128_is_nan( b ); + bIsSignalingNaN = float128_is_signaling_nan( b ); + a.high |= LIT64( 0x0000800000000000 ); + b.high |= LIT64( 0x0000800000000000 ); + if ( aIsSignalingNaN | bIsSignalingNaN ) float_raise( float_flag_invalid ); + if ( aIsNaN ) { + return ( aIsSignalingNaN & bIsNaN ) ? b : a; + } + else { + return b; + } + +} + +#endif + diff --git a/softfloat/softfloat.cpp b/softfloat/softfloat.cpp new file mode 100644 index 00000000..acc821aa --- /dev/null +++ b/softfloat/softfloat.cpp @@ -0,0 +1,5099 @@ + +/*============================================================================ + +This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#include "milieu.h" +#include "softfloat.h" + +/*---------------------------------------------------------------------------- +| Floating-point rounding mode, extended double-precision rounding precision, +| and exception flags. +*----------------------------------------------------------------------------*/ +int8 float_exception_flags = 0; +#ifdef FLOATX80 +int8 floatx80_rounding_precision = 80; +#endif + +int8 float_rounding_mode = float_round_nearest_even; + +/*---------------------------------------------------------------------------- +| Functions and definitions to determine: (1) whether tininess for underflow +| is detected before or after rounding by default, (2) what (if anything) +| happens when exceptions are raised, (3) how signaling NaNs are distinguished +| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs +| are propagated from function inputs to output. These details are target- +| specific. +*----------------------------------------------------------------------------*/ +#include "softfloat-specialize.h" + +/*---------------------------------------------------------------------------- +| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 +| and 7, and returns the properly rounded 32-bit integer corresponding to the +| input. If `zSign' is 1, the input is negated before being converted to an +| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input +| is simply rounded to an integer, with the inexact exception raised if the +| input cannot be represented exactly as an integer. However, if the fixed- +| point input is too large, the invalid exception is raised and the largest +| positive or negative integer is returned. +*----------------------------------------------------------------------------*/ + +static int32 roundAndPackInt32( flag zSign, bits64 absZ ) +{ + int8 roundingMode; + flag roundNearestEven; + int8 roundIncrement, roundBits; + int32 z; + + roundingMode = float_rounding_mode; + roundNearestEven = ( roundingMode == float_round_nearest_even ); + roundIncrement = 0x40; + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + roundIncrement = 0; + } + else { + roundIncrement = 0x7F; + if ( zSign ) { + if ( roundingMode == float_round_up ) roundIncrement = 0; + } + else { + if ( roundingMode == float_round_down ) roundIncrement = 0; + } + } + } + roundBits = absZ & 0x7F; + absZ = ( absZ + roundIncrement )>>7; + absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); + z = absZ; + if ( zSign ) z = - z; + z = (sbits32) z; + if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { + float_raise( float_flag_invalid ); + return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( roundBits ) float_exception_flags |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and +| `absZ1', with binary point between bits 63 and 64 (between the input words), +| and returns the properly rounded 64-bit integer corresponding to the input. +| If `zSign' is 1, the input is negated before being converted to an integer. +| Ordinarily, the fixed-point input is simply rounded to an integer, with +| the inexact exception raised if the input cannot be represented exactly as +| an integer. However, if the fixed-point input is too large, the invalid +| exception is raised and the largest positive or negative integer is +| returned. +*----------------------------------------------------------------------------*/ + +static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 ) +{ + int8 roundingMode; + flag roundNearestEven, increment; + int64 z; + + roundingMode = float_rounding_mode; + roundNearestEven = ( roundingMode == float_round_nearest_even ); + increment = ( (sbits64) absZ1 < 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + increment = 0; + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && absZ1; + } + else { + increment = ( roundingMode == float_round_up ) && absZ1; + } + } + } + if ( increment ) { + ++absZ0; + if ( absZ0 == 0 ) goto overflow; + absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); + } + z = absZ0; + if ( zSign ) z = - z; + z = (sbits64) z; + if ( z && ( ( z < 0 ) ^ zSign ) ) { + overflow: + float_raise( float_flag_invalid ); + return + zSign ? (sbits64) LIT64( 0x8000000000000000 ) + : LIT64( 0x7FFFFFFFFFFFFFFF ); + } + if ( absZ1 ) float_exception_flags |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the fraction bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE bits32 extractFloat32Frac( float32 a ) +{ + return a & 0x007FFFFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the exponent bits of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE int16 extractFloat32Exp( float32 a ) +{ + return ( a>>23 ) & 0xFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the single-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat32Sign( float32 a ) +{ + return a>>31; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal single-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros32( aSig ) - 8; + *zSigPtr = aSig<>7; + zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); + if ( zSig == 0 ) zExp = 0; + return packFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper single-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat32' except that `zSig' does not have to be normalized. +| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +static float32 + normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros32( zSig ) - 1; + return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<>52 ) & 0x7FF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the double-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat64Sign( float64 a ) +{ + return a>>63; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal double-precision floating-point value represented +| by the denormalized significand `aSig'. The normalized exponent and +| significand are stored at the locations pointed to by `zExpPtr' and +| `zSigPtr', respectively. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64( aSig ) - 11; + *zSigPtr = aSig<>10; + zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); + if ( zSig == 0 ) zExp = 0; + return packFloat64( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand `zSig', and returns the proper double-precision floating- +| point value corresponding to the abstract input. This routine is just like +| `roundAndPackFloat64' except that `zSig' does not have to be normalized. +| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' +| floating-point exponent. +*----------------------------------------------------------------------------*/ + +static float64 + normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig ) +{ + int8 shiftCount; + + shiftCount = countLeadingZeros64( zSig ) - 1; + return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<>48 ) & 0x7FFF; + +} + +/*---------------------------------------------------------------------------- +| Returns the sign bit of the quadruple-precision floating-point value `a'. +*----------------------------------------------------------------------------*/ + +INLINE flag extractFloat128Sign( float128 a ) +{ + return a.high>>63; + +} + +/*---------------------------------------------------------------------------- +| Normalizes the subnormal quadruple-precision floating-point value +| represented by the denormalized significand formed by the concatenation of +| `aSig0' and `aSig1'. The normalized exponent is stored at the location +| pointed to by `zExpPtr'. The most significant 49 bits of the normalized +| significand are stored at the location pointed to by `zSig0Ptr', and the +| least significant 64 bits of the normalized significand are stored at the +| location pointed to by `zSig1Ptr'. +*----------------------------------------------------------------------------*/ + +static void + normalizeFloat128Subnormal( + bits64 aSig0, + bits64 aSig1, + int32 *zExpPtr, + bits64 *zSig0Ptr, + bits64 *zSig1Ptr + ) +{ + int8 shiftCount; + + if ( aSig0 == 0 ) { + shiftCount = countLeadingZeros64( aSig1 ) - 15; + if ( shiftCount < 0 ) { + *zSig0Ptr = aSig1>>( - shiftCount ); + *zSig1Ptr = aSig1<<( shiftCount & 63 ); + } + else { + *zSig0Ptr = aSig1<>( - shiftCount ); + if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { + float_exception_flags |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int64 float32_to_int64( float32 a ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig; + bits64 aSig64, aSigExtra; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = 0xBE - aExp; + if ( shiftCount < 0 ) { + float_raise( float_flag_invalid ); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + if ( aExp ) aSig |= 0x00800000; + aSig64 = aSig; + aSig64 <<= 40; + shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); + return roundAndPackInt64( aSign, aSig64, aSigExtra ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the 64-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. If +| `a' is a NaN, the largest positive integer is returned. Otherwise, if the +| conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int64 float32_to_int64_round_to_zero( float32 a ) +{ + flag aSign; + int16 aExp, shiftCount; + bits32 aSig; + bits64 aSig64; + int64 z; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + shiftCount = aExp - 0xBE; + if ( 0 <= shiftCount ) { + if ( a != 0xDF000000 ) { + float_raise( float_flag_invalid ); + if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { + return LIT64( 0x7FFFFFFFFFFFFFFF ); + } + } + return (sbits64) LIT64( 0x8000000000000000 ); + } + else if ( aExp <= 0x7E ) { + if ( aExp | aSig ) float_exception_flags |= float_flag_inexact; + return 0; + } + aSig64 = aSig | 0x00800000; + aSig64 <<= 40; + z = aSig64>>( - shiftCount ); + if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { + float_exception_flags |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the double-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float32_to_float64( float32 a ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) ); + return packFloat64( aSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float32_to_floatx80( float32 a ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + aSig |= 0x00800000; + return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); + +} + +// 31-12-2016: Added for Previous +floatx80 float32_to_floatx80_allowunnormal( float32 a ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) ); + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + return packFloatx80( aSign, 0x3F81, ( (bits64) aSig )<<40 ); + } + aSig |= 0x00800000; + return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); + +} +// end of addition for Previous + +#endif + +#ifdef FLOATX128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the single-precision floating-point value +| `a' to the double-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float32_to_float128( float32 a ) +{ + flag aSign; + int16 aExp; + bits32 aSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) ); + return packFloat128( aSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Rounds the single-precision floating-point value `a' to an integer, and +| returns the result as a single-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_round_to_int( float32 a ) +{ + flag aSign; + int16 aExp; + bits32 lastBitMask, roundBitsMask; + int8 roundingMode; + float32 z; + + aExp = extractFloat32Exp( a ); + if ( 0x96 <= aExp ) { + if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { + return propagateFloat32NaN( a, a ); + } + return a; + } + if ( aExp <= 0x7E ) { + if ( (bits32) ( a<<1 ) == 0 ) return a; + float_exception_flags |= float_flag_inexact; + aSign = extractFloat32Sign( a ); + switch ( float_rounding_mode ) { + case float_round_nearest_even: + if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { + return packFloat32( aSign, 0x7F, 0 ); + } + break; + case float_round_down: + return aSign ? 0xBF800000 : 0; + case float_round_up: + return aSign ? 0x80000000 : 0x3F800000; + } + return packFloat32( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x96 - aExp; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + z += lastBitMask>>1; + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) { + z += roundBitsMask; + } + } + z &= ~ roundBitsMask; + if ( z != a ) float_exception_flags |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the single-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 addFloat32Sigs( float32 a, float32 b, flag zSign ) +{ + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + expDiff = aExp - bExp; + aSig <<= 6; + bSig <<= 6; + if ( 0 < expDiff ) { + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= 0x20000000; + } + shift32RightJamming( bSig, expDiff, &bSig ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + return packFloat32( zSign, 0xFF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= 0x20000000; + } + shift32RightJamming( aSig, - expDiff, &aSig ); + zExp = bExp; + } + else { + if ( aExp == 0xFF ) { + if ( aSig | bSig ) return propagateFloat32NaN( a, b ); + return a; + } + if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); + zSig = 0x40000000 + aSig + bSig; + zExp = aExp; + goto roundAndPack; + } + aSig |= 0x20000000; + zSig = ( aSig + bSig )<<1; + --zExp; + if ( (sbits32) zSig < 0 ) { + zSig = aSig + bSig; + ++zExp; + } + roundAndPack: + return roundAndPackFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the single- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float32 subFloat32Sigs( float32 a, float32 b, flag zSign ) +{ + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + expDiff = aExp - bExp; + aSig <<= 7; + bSig <<= 7; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0xFF ) { + if ( aSig | bSig ) return propagateFloat32NaN( a, b ); + float_raise( float_flag_invalid ); + return float32_default_nan; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloat32( float_rounding_mode == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + return packFloat32( zSign ^ 1, 0xFF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= 0x40000000; + } + shift32RightJamming( aSig, - expDiff, &aSig ); + bSig |= 0x40000000; + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= 0x40000000; + } + shift32RightJamming( bSig, expDiff, &bSig ); + aSig |= 0x40000000; + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the single-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_add( float32 a, float32 b ) +{ + flag aSign, bSign; + + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign == bSign ) { + return addFloat32Sigs( a, b, aSign ); + } + else { + return subFloat32Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sub( float32 a, float32 b ) +{ + flag aSign, bSign; + + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign == bSign ) { + return subFloat32Sigs( a, b, aSign ); + } + else { + return addFloat32Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the single-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_mul( float32 a, float32 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits32 aSig, bSig; + bits64 zSig64; + bits32 zSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0xFF ) { + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { + return propagateFloat32NaN( a, b ); + } + if ( ( bExp | bSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x7F; + aSig = ( aSig | 0x00800000 )<<7; + bSig = ( bSig | 0x00800000 )<<8; + shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); + zSig = zSig64; + if ( 0 <= (sbits32) ( zSig<<1 ) ) { + zSig <<= 1; + --zExp; + } + return roundAndPackFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the single-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_div( float32 a, float32 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits32 aSig, bSig, zSig; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); + bSign = extractFloat32Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, b ); + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + float_raise( float_flag_invalid ); + return float32_default_nan; + } + return packFloat32( zSign, 0xFF, 0 ); + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + return packFloat32( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float32_default_nan; + } + float_raise( float_flag_divbyzero ); + return packFloat32( zSign, 0xFF, 0 ); + } + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x7D; + aSig = ( aSig | 0x00800000 )<<7; + bSig = ( bSig | 0x00800000 )<<8; + if ( bSig <= ( aSig + aSig ) ) { + aSig >>= 1; + ++zExp; + } + zSig = ( ( (bits64) aSig )<<32 ) / bSig; + if ( ( zSig & 0x3F ) == 0 ) { + zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); + } + return roundAndPackFloat32( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the single-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_rem( float32 a, float32 b ) +{ + flag aSign, zSign; + int16 aExp, bExp, expDiff; + bits32 aSig, bSig; + bits32 q; + bits64 aSig64, bSig64, q64; + bits32 alternateASig; + sbits32 sigMean; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + bSig = extractFloat32Frac( b ); + bExp = extractFloat32Exp( b ); +// bSign = extractFloat32Sign( b ); + if ( aExp == 0xFF ) { + if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { + return propagateFloat32NaN( a, b ); + } + float_raise( float_flag_invalid ); + return float32_default_nan; + } + if ( bExp == 0xFF ) { + if ( bSig ) return propagateFloat32NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + float_raise( float_flag_invalid ); + return float32_default_nan; + } + normalizeFloat32Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return a; + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + expDiff = aExp - bExp; + aSig |= 0x00800000; + bSig |= 0x00800000; + if ( expDiff < 32 ) { + aSig <<= 8; + bSig <<= 8; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + aSig >>= 1; + } + q = ( bSig <= aSig ); + if ( q ) aSig -= bSig; + if ( 0 < expDiff ) { + q = ( ( (bits64) aSig )<<32 ) / bSig; + q >>= 32 - expDiff; + bSig >>= 2; + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; + } + else { + aSig >>= 2; + bSig >>= 2; + } + } + else { + if ( bSig <= aSig ) aSig -= bSig; + aSig64 = ( (bits64) aSig )<<40; + bSig64 = ( (bits64) bSig )<<40; + expDiff -= 64; + while ( 0 < expDiff ) { + q64 = estimateDiv128To64( aSig64, 0, bSig64 ); + q64 = ( 2 < q64 ) ? q64 - 2 : 0; + aSig64 = - ( ( bSig * q64 )<<38 ); + expDiff -= 62; + } + expDiff += 64; + q64 = estimateDiv128To64( aSig64, 0, bSig64 ); + q64 = ( 2 < q64 ) ? q64 - 2 : 0; + q = q64>>( 64 - expDiff ); + bSig <<= 6; + aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; + } + do { + alternateASig = aSig; + ++q; + aSig -= bSig; + } while ( 0 <= (sbits32) aSig ); + sigMean = aSig + alternateASig; + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { + aSig = alternateASig; + } + zSign = ( (sbits32) aSig < 0 ); + if ( zSign ) aSig = - aSig; + return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the single-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float32_sqrt( float32 a ) +{ + flag aSign; + int16 aExp, zExp; + bits32 aSig, zSig; + bits64 rem, term; + + aSig = extractFloat32Frac( a ); + aExp = extractFloat32Exp( a ); + aSign = extractFloat32Sign( a ); + if ( aExp == 0xFF ) { + if ( aSig ) return propagateFloat32NaN( a, 0 ); + if ( ! aSign ) return a; + float_raise( float_flag_invalid ); + return float32_default_nan; + } + if ( aSign ) { + if ( ( aExp | aSig ) == 0 ) return a; + float_raise( float_flag_invalid ); + return float32_default_nan; + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return 0; + normalizeFloat32Subnormal( aSig, &aExp, &aSig ); + } + zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; + aSig = ( aSig | 0x00800000 )<<8; + zSig = estimateSqrt32( aExp, aSig ) + 2; + if ( ( zSig & 0x7F ) <= 5 ) { + if ( zSig < 2 ) { + zSig = 0x7FFFFFFF; + goto roundAndPack; + } + aSig >>= aExp & 1; + term = ( (bits64) zSig ) * zSig; + rem = ( ( (bits64) aSig )<<32 ) - term; + while ( (sbits64) rem < 0 ) { + --zSig; + rem += ( ( (bits64) zSig )<<1 ) | 1; + } + zSig |= ( rem != 0 ); + } + shift32RightJamming( zSig, 1, &zSig ); + roundAndPack: + return roundAndPackFloat32( 0, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_eq( float32 a, float32 b ) +{ + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. The comparison +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_le( float32 a, float32 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); + return ( a == b ) || ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_lt( float32 a, float32 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); + return ( a != b ) && ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_eq_signaling( float32 a, float32 b ) +{ + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_le_quiet( float32 a, float32 b ) +{ + flag aSign, bSign; +// int16 aExp, bExp; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 ); + return ( a == b ) || ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the single-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float32_lt_quiet( float32 a, float32 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) + || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) + ) { + if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat32Sign( a ); + bSign = extractFloat32Sign( b ); + if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 ); + return ( a != b ) && ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float64_to_int32( float64 a ) +{ + flag aSign; + int16 aExp, shiftCount; + bits64 aSig; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; + if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x42C - aExp; + if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); + return roundAndPackInt32( aSign, aSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the 32-bit two's complement integer format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. +| If `a' is a NaN, the largest positive integer is returned. Otherwise, if +| the conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float64_to_int32_round_to_zero( float64 a ) +{ + flag aSign; + int16 aExp, shiftCount; + bits64 aSig, savedASig; + int32 z; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( 0x41E < aExp ) { + if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FF ) { + if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; + return 0; + } + aSig |= LIT64( 0x0010000000000000 ); + shiftCount = 0x433 - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = aSig; + if ( aSign ) z = - z; + z = (sbits32) z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid ); + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig<>( - shiftCount ); + if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { + float_exception_flags |= float_flag_inexact; + } + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the single-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float64_to_float32( float64 a ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + bits32 zSig; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) ); + return packFloat32( aSign, 0xFF, 0 ); + } + shift64RightJamming( aSig, 22, &aSig ); + zSig = aSig; + if ( aExp || zSig ) { + zSig |= 0x40000000; + aExp -= 0x381; + } + return roundAndPackFloat32( aSign, aExp, zSig ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the extended double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float64_to_floatx80( float64 a ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + return + packFloatx80( + aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); + +} + +// 31-12-2016: Added for Previous +floatx80 float64_to_floatx80_allowunnormal( float64 a ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) ); + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + return packFloatx80( aSign, 0x3C01, aSig<<11); + } + return + packFloatx80( + aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); + +} +// end of addition for Previous + +#endif + +#ifdef FLOATX128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the double-precision floating-point value +| `a' to the quadruple-precision floating-point format. The conversion is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float64_to_float128( float64 a ) +{ + flag aSign; + int16 aExp; + bits64 aSig, zSig0, zSig1; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) ); + return packFloat128( aSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + --aExp; + } + shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); + return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Rounds the double-precision floating-point value `a' to an integer, and +| returns the result as a double-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_round_to_int( float64 a ) +{ + flag aSign; + int16 aExp; + bits64 lastBitMask, roundBitsMask; + int8 roundingMode; + float64 z; + + aExp = extractFloat64Exp( a ); + if ( 0x433 <= aExp ) { + if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { + return propagateFloat64NaN( a, a ); + } + return a; + } + if ( aExp < 0x3FF ) { + if ( (bits64) ( a<<1 ) == 0 ) return a; + float_exception_flags |= float_flag_inexact; + aSign = extractFloat64Sign( a ); + switch ( float_rounding_mode ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { + return packFloat64( aSign, 0x3FF, 0 ); + } + break; + case float_round_down: + return aSign ? LIT64( 0xBFF0000000000000 ) : 0; + case float_round_up: + return + aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ); + } + return packFloat64( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x433 - aExp; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + z += lastBitMask>>1; + if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) { + z += roundBitsMask; + } + } + z &= ~ roundBitsMask; + if ( z != a ) float_exception_flags |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the double-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 addFloat64Sigs( float64 a, float64 b, flag zSign ) +{ + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + expDiff = aExp - bExp; + aSig <<= 9; + bSig <<= 9; + if ( 0 < expDiff ) { + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= LIT64( 0x2000000000000000 ); + } + shift64RightJamming( bSig, expDiff, &bSig ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b ); + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= LIT64( 0x2000000000000000 ); + } + shift64RightJamming( aSig, - expDiff, &aSig ); + zExp = bExp; + } + else { + if ( aExp == 0x7FF ) { + if ( aSig | bSig ) return propagateFloat64NaN( a, b ); + return a; + } + if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); + zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; + zExp = aExp; + goto roundAndPack; + } + aSig |= LIT64( 0x2000000000000000 ); + zSig = ( aSig + bSig )<<1; + --zExp; + if ( (sbits64) zSig < 0 ) { + zSig = aSig + bSig; + ++zExp; + } + roundAndPack: + return roundAndPackFloat64( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float64 subFloat64Sigs( float64 a, float64 b, flag zSign ) +{ + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig; + int16 expDiff; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + expDiff = aExp - bExp; + aSig <<= 10; + bSig <<= 10; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FF ) { + if ( aSig | bSig ) return propagateFloat64NaN( a, b ); + float_raise( float_flag_invalid ); + return float64_default_nan; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloat64( float_rounding_mode == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b ); + return packFloat64( zSign ^ 1, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig |= LIT64( 0x4000000000000000 ); + } + shift64RightJamming( aSig, - expDiff, &aSig ); + bSig |= LIT64( 0x4000000000000000 ); + bBigger: + zSig = bSig - aSig; + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig |= LIT64( 0x4000000000000000 ); + } + shift64RightJamming( bSig, expDiff, &bSig ); + aSig |= LIT64( 0x4000000000000000 ); + aBigger: + zSig = aSig - bSig; + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat64( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the double-precision floating-point values `a' +| and `b'. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_add( float64 a, float64 b ) +{ + flag aSign, bSign; + + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign == bSign ) { + return addFloat64Sigs( a, b, aSign ); + } + else { + return subFloat64Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sub( float64 a, float64 b ) +{ + flag aSign, bSign; + + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign == bSign ) { + return subFloat64Sigs( a, b, aSign ); + } + else { + return addFloat64Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the double-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_mul( float64 a, float64 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FF ) { + if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { + return propagateFloat64NaN( a, b ); + } + if ( ( bExp | bSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b ); + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x3FF; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + mul64To128( aSig, bSig, &zSig0, &zSig1 ); + zSig0 |= ( zSig1 != 0 ); + if ( 0 <= (sbits64) ( zSig0<<1 ) ) { + zSig0 <<= 1; + --zExp; + } + return roundAndPackFloat64( zSign, zExp, zSig0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the double-precision floating-point value `a' +| by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_div( float64 a, float64 b ) +{ + flag aSign, bSign, zSign; + int16 aExp, bExp, zExp; + bits64 aSig, bSig, zSig; + bits64 rem0, rem1; + bits64 term0, term1; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); + bSign = extractFloat64Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, b ); + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b ); + float_raise( float_flag_invalid ); + return float64_default_nan; + } + return packFloat64( zSign, 0x7FF, 0 ); + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b ); + return packFloat64( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + float_raise( float_flag_invalid ); + return float64_default_nan; + } + float_raise( float_flag_divbyzero ); + return packFloat64( zSign, 0x7FF, 0 ); + } + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x3FD; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + if ( bSig <= ( aSig + aSig ) ) { + aSig >>= 1; + ++zExp; + } + zSig = estimateDiv128To64( aSig, 0, bSig ); + if ( ( zSig & 0x1FF ) <= 2 ) { + mul64To128( bSig, zSig, &term0, &term1 ); + sub128( aSig, 0, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig; + add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); + } + zSig |= ( rem1 != 0 ); + } + return roundAndPackFloat64( zSign, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the double-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_rem( float64 a, float64 b ) +{ + flag aSign, zSign; + int16 aExp, bExp, expDiff; + bits64 aSig, bSig; + bits64 q, alternateASig; + sbits64 sigMean; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + bSig = extractFloat64Frac( b ); + bExp = extractFloat64Exp( b ); +// bSign = extractFloat64Sign( b ); + if ( aExp == 0x7FF ) { + if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { + return propagateFloat64NaN( a, b ); + } + float_raise( float_flag_invalid ); + return float64_default_nan; + } + if ( bExp == 0x7FF ) { + if ( bSig ) return propagateFloat64NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + float_raise( float_flag_invalid ); + return float64_default_nan; + } + normalizeFloat64Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return a; + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + expDiff = aExp - bExp; + aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; + bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + aSig >>= 1; + } + q = ( bSig <= aSig ); + if ( q ) aSig -= bSig; + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig, 0, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + aSig = - ( ( bSig>>2 ) * q ); + expDiff -= 62; + } + expDiff += 64; + if ( 0 < expDiff ) { + q = estimateDiv128To64( aSig, 0, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + q >>= 64 - expDiff; + bSig >>= 2; + aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; + } + else { + aSig >>= 2; + bSig >>= 2; + } + do { + alternateASig = aSig; + ++q; + aSig -= bSig; + } while ( 0 <= (sbits64) aSig ); + sigMean = aSig + alternateASig; + if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { + aSig = alternateASig; + } + zSign = ( (sbits64) aSig < 0 ); + if ( zSign ) aSig = - aSig; + return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the double-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float64_sqrt( float64 a ) +{ + flag aSign; + int16 aExp, zExp; + bits64 aSig, zSig, doubleZSig; + bits64 rem0, rem1, term0, term1; +// float64 z; + + aSig = extractFloat64Frac( a ); + aExp = extractFloat64Exp( a ); + aSign = extractFloat64Sign( a ); + if ( aExp == 0x7FF ) { + if ( aSig ) return propagateFloat64NaN( a, a ); + if ( ! aSign ) return a; + float_raise( float_flag_invalid ); + return float64_default_nan; + } + if ( aSign ) { + if ( ( aExp | aSig ) == 0 ) return a; + float_raise( float_flag_invalid ); + return float64_default_nan; + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return 0; + normalizeFloat64Subnormal( aSig, &aExp, &aSig ); + } + zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; + aSig |= LIT64( 0x0010000000000000 ); + zSig = estimateSqrt32( aExp, aSig>>21 ); + aSig <<= 9 - ( aExp & 1 ); + zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); + if ( ( zSig & 0x1FF ) <= 5 ) { + doubleZSig = zSig<<1; + mul64To128( zSig, zSig, &term0, &term1 ); + sub128( aSig, 0, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig; + doubleZSig -= 2; + add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); + } + zSig |= ( ( rem0 | rem1 ) != 0 ); + } + return roundAndPackFloat64( 0, zExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is equal to the +| corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_eq( float64 a, float64 b ) +{ + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. The comparison is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_le( float64 a, float64 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); + return ( a == b ) || ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_lt( float64 a, float64 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); + return ( a != b ) && ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is equal to the +| corresponding value `b', and 0 otherwise. The invalid exception is raised +| if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_eq_signaling( float64 a, float64 b ) +{ + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than or +| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_le_quiet( float64 a, float64 b ) +{ + flag aSign, bSign; +// int16 aExp, bExp; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 ); + return ( a == b ) || ( aSign ^ ( a < b ) ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the double-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float64_lt_quiet( float64 a, float64 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) + || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) + ) { + if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat64Sign( a ); + bSign = extractFloat64Sign( b ); + if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 ); + return ( a != b ) && ( aSign ^ ( a < b ) ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 32-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic---which means in particular that the conversion +| is rounded according to the current rounding mode. If `a' is a NaN, the +| largest positive integer is returned. Otherwise, if the conversion +| overflows, the largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 floatx80_to_int32( floatx80 a ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; + shiftCount = 0x4037 - aExp; + if ( shiftCount <= 0 ) shiftCount = 1; + shift64RightJamming( aSig, shiftCount, &aSig ); + return roundAndPackInt32( aSign, aSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the 32-bit two's complement integer format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic, except that the conversion is always rounded +| toward zero. If `a' is a NaN, the largest positive integer is returned. +| Otherwise, if the conversion overflows, the largest integer with the same +| sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 floatx80_to_int32_round_to_zero( floatx80 a ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig, savedASig; + int32 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( 0x401E < aExp ) { + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FFF ) { + if ( aExp || aSig ) float_exception_flags |= float_flag_inexact; + return 0; + } + shiftCount = 0x403E - aExp; + savedASig = aSig; + aSig >>= shiftCount; + z = aSig; + if ( aSign ) z = - z; + z = (sbits32) z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid ); + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig<>( - shiftCount ); + if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { + float_exception_flags |= float_flag_inexact; + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the single-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 floatx80_to_float32( floatx80 a ) +{ + flag aSign; + int32 aExp; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) { + return commonNaNToFloat32( floatx80ToCommonNaN( a ) ); + } + return packFloat32( aSign, 0xFF, 0 ); + } + shift64RightJamming( aSig, 33, &aSig ); + if ( aExp || aSig ) aExp -= 0x3F81; + return roundAndPackFloat32( aSign, aExp, aSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the double-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 floatx80_to_float64( floatx80 a ) +{ + flag aSign; + int32 aExp; + bits64 aSig, zSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) { + return commonNaNToFloat64( floatx80ToCommonNaN( a ) ); + } + return packFloat64( aSign, 0x7FF, 0 ); + } + shift64RightJamming( aSig, 1, &zSig ); + if ( aExp || aSig ) aExp -= 0x3C01; + return roundAndPackFloat64( aSign, aExp, zSig ); + +} + +#ifdef FLOATX128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the extended double-precision floating- +| point value `a' to the quadruple-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 floatx80_to_float128( floatx80 a ) +{ + flag aSign; + int16 aExp; + bits64 aSig, zSig0, zSig1; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { + return commonNaNToFloat128( floatx80ToCommonNaN( a ) ); + } + shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); + return packFloat128( aSign, aExp, zSig0, zSig1 ); + +} + +#endif + +// 30-01-2016: Added for Previous + +floatx80 floatx80_normalize( floatx80 a ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + + if (aSig == 0) { + aExp = 0; + return packFloatx80( aSign, aExp, aSig ); + } + while ( (aSig & LIT64( 0x8000000000000000 ) ) == LIT64( 0x0000000000000000 ) ) { + if ( aExp == 0 ) { + float_raise( float_flag_denormal ); + break; + } + aSig = aSig << 1; + aExp--; + } + return packFloatx80( aSign, aExp, aSig ); + +} + +floatx80 floatx80_round32( floatx80 a ) +{ + flag aSign; + int16 aExp; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + + return roundAndPackFloatx80(32, aSign, aExp, aSig, 0); + +} +// end of addition for Previous + +/*---------------------------------------------------------------------------- +| Rounds the extended double-precision floating-point value `a' to an integer, +| and returns the result as an extended quadruple-precision floating-point +| value. The operation is performed according to the IEC/IEEE Standard for +| Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_round_to_int( floatx80 a ) +{ + flag aSign; + int32 aExp; + bits64 lastBitMask, roundBitsMask; + int8 roundingMode; + floatx80 z; + + aExp = extractFloatx80Exp( a ); + if ( 0x403E <= aExp ) { + if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { + return propagateFloatx80NaN( a, a ); + } + return a; + } + if ( aExp < 0x3FFF ) { + if ( ( aExp == 0 ) + && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { + return a; + } + float_exception_flags |= float_flag_inexact; + aSign = extractFloatx80Sign( a ); + switch ( float_rounding_mode ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) + ) { + return + packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); + } + break; + case float_round_down: + return + aSign ? + packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) + : packFloatx80( 0, 0, 0 ); + case float_round_up: + return + aSign ? packFloatx80( 1, 0, 0 ) + : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); + } + return packFloatx80( aSign, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x403E - aExp; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + z.low += lastBitMask>>1; + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { + z.low += roundBitsMask; + } + } + z.low &= ~ roundBitsMask; + if ( z.low == 0 ) { + ++z.high; + z.low = LIT64( 0x8000000000000000 ); + } + if ( z.low != a.low ) float_exception_flags |= float_flag_inexact; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the extended double- +| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is +| negated before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) +{ + int32 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + int32 expDiff; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) { + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); + return a; + } + if ( bExp == 0 ) --expDiff; + shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) ++expDiff; + shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); + zExp = bExp; + } + else { + if ( aExp == 0x7FFF ) { + if ( (bits64) ( ( aSig | bSig )<<1 ) ) { + return propagateFloatx80NaN( a, b ); + } + return a; + } + zSig1 = 0; + zSig0 = aSig + bSig; + if ( aExp == 0 ) { + normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); + goto roundAndPack; + } + zExp = aExp; + goto shiftRight1; + } + zSig0 = aSig + bSig; + if ( (sbits64) zSig0 < 0 ) goto roundAndPack; + shiftRight1: + shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); + zSig0 |= LIT64( 0x8000000000000000 ); + ++zExp; + roundAndPack: + return + roundAndPackFloatx80( + floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the extended +| double-precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign ) +{ + int32 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + int32 expDiff; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FFF ) { + if ( (bits64) ( ( aSig | bSig )<<1 ) ) { + return propagateFloatx80NaN( a, b ); + } + float_raise( float_flag_invalid ); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + zSig1 = 0; + if ( bSig < aSig ) goto aBigger; + if ( aSig < bSig ) goto bBigger; + return packFloatx80( float_rounding_mode == float_round_down, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); + return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) ++expDiff; + shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); + bBigger: + sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); + return a; + } + if ( bExp == 0 ) --expDiff; + shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); + aBigger: + sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); + zExp = aExp; + normalizeRoundAndPack: + return + normalizeRoundAndPackFloatx80( + floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the extended double-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_add( floatx80 a, floatx80 b ) +{ + flag aSign, bSign; + + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign == bSign ) { + return addFloatx80Sigs( a, b, aSign ); + } + else { + return subFloatx80Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the extended double-precision floating- +| point values `a' and `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_sub( floatx80 a, floatx80 b ) +{ + flag aSign, bSign; + + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign == bSign ) { + return subFloatx80Sigs( a, b, aSign ); + } + else { + return addFloatx80Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the extended double-precision floating- +| point values `a' and `b'. The operation is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_mul( floatx80 a, floatx80 b ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + bSign = extractFloatx80Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) + || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { + return propagateFloatx80NaN( a, b ); + } + if ( ( bExp | bSig ) == 0 ) goto invalid; + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); + if ( ( aExp | aSig ) == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + zExp = aExp + bExp - 0x3FFE; + mul64To128( aSig, bSig, &zSig0, &zSig1 ); + if ( 0 < (sbits64) zSig0 ) { + shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); + --zExp; + } + return + roundAndPackFloatx80( + floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the extended double-precision floating-point +| value `a' by the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_div( floatx80 a, floatx80 b ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + bits64 aSig, bSig, zSig0, zSig1; + bits64 rem0, rem1, rem2, term0, term1, term2; + floatx80 z; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); + bSign = extractFloatx80Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b ); + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); + goto invalid; + } + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); + return packFloatx80( zSign, 0, 0 ); + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + if ( ( aExp | aSig ) == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + float_raise( float_flag_divbyzero ); + return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); + normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); + } + zExp = aExp - bExp + 0x3FFE; + rem1 = 0; + if ( bSig <= aSig ) { + shift128Right( aSig, 0, 1, &aSig, &rem1 ); + ++zExp; + } + zSig0 = estimateDiv128To64( aSig, rem1, bSig ); + mul64To128( bSig, zSig0, &term0, &term1 ); + sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig0; + add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, bSig ); + if ( (bits64) ( zSig1<<1 ) <= 8 ) { + mul64To128( bSig, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + while ( (sbits64) rem1 < 0 ) { + --zSig1; + add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); + } + zSig1 |= ( ( rem1 | rem2 ) != 0 ); + } + return + roundAndPackFloatx80( + floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the extended double-precision floating-point value +| `a' with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_rem( floatx80 a, floatx80 b ) +{ + flag aSign, zSign; + int32 aExp, bExp, expDiff; + bits64 aSig0, aSig1, bSig; + bits64 q, term0, term1, alternateASig0, alternateASig1; + floatx80 z; + + aSig0 = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + bSig = extractFloatx80Frac( b ); + bExp = extractFloatx80Exp( b ); +// bSign = extractFloatx80Sign( b ); + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig0<<1 ) + || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { + return propagateFloatx80NaN( a, b ); + } + goto invalid; + } + if ( bExp == 0x7FFF ) { + if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + if ( bSig == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); + } + if ( aExp == 0 ) { + if ( (bits64) ( aSig0<<1 ) == 0 ) return a; + normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); + } + bSig |= LIT64( 0x8000000000000000 ); + zSign = aSign; + expDiff = aExp - bExp; + aSig1 = 0; + if ( expDiff < 0 ) { + if ( expDiff < -1 ) return a; + shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); + expDiff = 0; + } + q = ( bSig <= aSig0 ); + if ( q ) aSig0 -= bSig; + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + mul64To128( bSig, q, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); + expDiff -= 62; + } + expDiff += 64; + if ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig ); + q = ( 2 < q ) ? q - 2 : 0; + q >>= 64 - expDiff; + mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); + while ( le128( term0, term1, aSig0, aSig1 ) ) { + ++q; + sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); + } + } + else { + term1 = 0; + term0 = bSig; + } + sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); + if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) + || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) + && ( q & 1 ) ) + ) { + aSig0 = alternateASig0; + aSig1 = alternateASig1; + zSign = ! zSign; + } + return + normalizeRoundAndPackFloatx80( + 80, zSign, bExp + expDiff, aSig0, aSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the extended double-precision floating-point +| value `a'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 floatx80_sqrt( floatx80 a ) +{ + flag aSign; + int32 aExp, zExp; + bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; + floatx80 z; + + aSig0 = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a ); + if ( ! aSign ) return a; + goto invalid; + } + if ( aSign ) { + if ( ( aExp | aSig0 ) == 0 ) return a; + invalid: + float_raise( float_flag_invalid ); + z.low = floatx80_default_nan_low; + z.high = floatx80_default_nan_high; + return z; + } + if ( aExp == 0 ) { + if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); + normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); + } + zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; + zSig0 = estimateSqrt32( aExp, aSig0>>32 ); + shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); + zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); + doubleZSig0 = zSig0<<1; + mul64To128( zSig0, zSig0, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig0; + doubleZSig0 -= 2; + add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); + if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { + if ( zSig1 == 0 ) zSig1 = 1; + mul64To128( doubleZSig0, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + mul64To128( zSig1, zSig1, &term2, &term3 ); + sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); + while ( (sbits64) rem1 < 0 ) { + --zSig1; + shortShift128Left( 0, zSig1, 1, &term2, &term3 ); + term3 |= 1; + term2 |= doubleZSig0; + add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); + zSig0 |= doubleZSig0; + return + roundAndPackFloatx80( + floatx80_rounding_precision, 0, zExp, zSig0, zSig1 ); + +} + +// 07-01-2017: Added for Previous +/*---------------------------------------------------------------------------- + | Returns the mantissa of the extended double-precision floating-point + | value `a'. + *----------------------------------------------------------------------------*/ + +floatx80 floatx80_getman( floatx80 a ) +{ + flag aSign; + int32 aExp; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, a ); + float_raise( float_flag_invalid ); + a.low = floatx80_default_nan_low; + a.high = floatx80_default_nan_high; + return a; + } + + if ( aExp == 0 ) { + if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); + } + + return roundAndPackFloatx80(floatx80_rounding_precision, aSign, 0x3FFF, aSig, 0); +} + +/*---------------------------------------------------------------------------- + | Returns the exponent of the extended double-precision floating-point + | value `a' as an extended double-precision value. + *----------------------------------------------------------------------------*/ + +floatx80 floatx80_getexp( floatx80 a ) +{ + flag aSign; + int32 aExp; + bits64 aSig; + + aSig = extractFloatx80Frac( a ); + aExp = extractFloatx80Exp( a ); + aSign = extractFloatx80Sign( a ); + + if ( aExp == 0x7FFF ) { + if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, a ); + float_raise( float_flag_invalid ); + a.low = floatx80_default_nan_low; + a.high = floatx80_default_nan_high; + return a; + } + + if (aExp == 0 && aSig == 0) return packFloatx80(aSign, 0, 0); + + return int32_to_floatx80(aExp - 0x3FFF); +} +// End of addition for Previous + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| equal to the corresponding value `b', and 0 otherwise. The comparison is +| performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +flag floatx80_eq( floatx80 a, floatx80 b ) +{ + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| less than or equal to the corresponding value `b', and 0 otherwise. The +| comparison is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag floatx80_le( floatx80 a, floatx80 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is +| less than the corresponding value `b', and 0 otherwise. The comparison +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +flag floatx80_lt( floatx80 a, floatx80 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is equal +| to the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag floatx80_eq_signaling( floatx80 a, floatx80 b ) +{ + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is less +| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs +| do not cause an exception. Otherwise, the comparison is performed according +| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag floatx80_le_quiet( floatx80 a, floatx80 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the extended double-precision floating-point value `a' is less +| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause +| an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag floatx80_lt_quiet( floatx80 a, floatx80 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( a )<<1 ) ) + || ( ( extractFloatx80Exp( b ) == 0x7FFF ) + && (bits64) ( extractFloatx80Frac( b )<<1 ) ) + ) { + if ( floatx80_is_signaling_nan( a ) + || floatx80_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloatx80Sign( a ); + bSign = extractFloatx80Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +#endif + +#ifdef FLOATX128 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 32-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic---which means in particular that the conversion is rounded +| according to the current rounding mode. If `a' is a NaN, the largest +| positive integer is returned. Otherwise, if the conversion overflows, the +| largest integer with the same sign as `a' is returned. +*----------------------------------------------------------------------------*/ + +int32 float128_to_int32( float128 a ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; + if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); + aSig0 |= ( aSig1 != 0 ); + shiftCount = 0x4028 - aExp; + if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); + return roundAndPackInt32( aSign, aSig0 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the 32-bit two's complement integer format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic, except that the conversion is always rounded toward zero. If +| `a' is a NaN, the largest positive integer is returned. Otherwise, if the +| conversion overflows, the largest integer with the same sign as `a' is +| returned. +*----------------------------------------------------------------------------*/ + +int32 float128_to_int32_round_to_zero( float128 a ) +{ + flag aSign; + int32 aExp, shiftCount; + bits64 aSig0, aSig1, savedASig; + int32 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + aSig0 |= ( aSig1 != 0 ); + if ( 0x401E < aExp ) { + if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; + goto invalid; + } + else if ( aExp < 0x3FFF ) { + if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact; + return 0; + } + aSig0 |= LIT64( 0x0001000000000000 ); + shiftCount = 0x402F - aExp; + savedASig = aSig0; + aSig0 >>= shiftCount; + z = aSig0; + if ( aSign ) z = - z; + z = (sbits32) z; + if ( ( z < 0 ) ^ aSign ) { + invalid: + float_raise( float_flag_invalid ); + return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; + } + if ( ( aSig0<>( ( - shiftCount ) & 63 ) ); + if ( (bits64) ( aSig1<>( - shiftCount ); + if ( aSig1 + || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { + float_exception_flags |= float_flag_inexact; + } + } + if ( aSign ) z = - z; + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the single-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float32 float128_to_float32( float128 a ) +{ + flag aSign; + int32 aExp; + bits64 aSig0, aSig1; + bits32 zSig; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloat32( float128ToCommonNaN( a ) ); + } + return packFloat32( aSign, 0xFF, 0 ); + } + aSig0 |= ( aSig1 != 0 ); + shift64RightJamming( aSig0, 18, &aSig0 ); + zSig = aSig0; + if ( aExp || zSig ) { + zSig |= 0x40000000; + aExp -= 0x3F81; + } + return roundAndPackFloat32( aSign, aExp, zSig ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the double-precision floating-point format. The conversion +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +float64 float128_to_float64( float128 a ) +{ + flag aSign; + int32 aExp; + bits64 aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloat64( float128ToCommonNaN( a ) ); + } + return packFloat64( aSign, 0x7FF, 0 ); + } + shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); + aSig0 |= ( aSig1 != 0 ); + if ( aExp || aSig0 ) { + aSig0 |= LIT64( 0x4000000000000000 ); + aExp -= 0x3C01; + } + return roundAndPackFloat64( aSign, aExp, aSig0 ); + +} + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Returns the result of converting the quadruple-precision floating-point +| value `a' to the extended double-precision floating-point format. The +| conversion is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +floatx80 float128_to_floatx80( float128 a ) +{ + flag aSign; + int32 aExp; + bits64 aSig0, aSig1; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) { + return commonNaNToFloatx80( float128ToCommonNaN( a ) ); + } + return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + else { + aSig0 |= LIT64( 0x0001000000000000 ); + } + shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); + return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 ); + +} + +#endif + +/*---------------------------------------------------------------------------- +| Rounds the quadruple-precision floating-point value `a' to an integer, and +| returns the result as a quadruple-precision floating-point value. The +| operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_round_to_int( float128 a ) +{ + flag aSign; + int32 aExp; + bits64 lastBitMask, roundBitsMask; + int8 roundingMode; + float128 z; + + aExp = extractFloat128Exp( a ); + if ( 0x402F <= aExp ) { + if ( 0x406F <= aExp ) { + if ( ( aExp == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) + ) { + return propagateFloat128NaN( a, a ); + } + return a; + } + lastBitMask = 1; + lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; + roundBitsMask = lastBitMask - 1; + z = a; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + if ( lastBitMask ) { + add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); + if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; + } + else { + if ( (sbits64) z.low < 0 ) { + ++z.high; + if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; + } + } + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat128Sign( z ) + ^ ( roundingMode == float_round_up ) ) { + add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); + } + } + z.low &= ~ roundBitsMask; + } + else { + if ( aExp < 0x3FFF ) { + if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; + float_exception_flags |= float_flag_inexact; + aSign = extractFloat128Sign( a ); + switch ( float_rounding_mode ) { + case float_round_nearest_even: + if ( ( aExp == 0x3FFE ) + && ( extractFloat128Frac0( a ) + | extractFloat128Frac1( a ) ) + ) { + return packFloat128( aSign, 0x3FFF, 0, 0 ); + } + break; + case float_round_down: + return + aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) + : packFloat128( 0, 0, 0, 0 ); + case float_round_up: + return + aSign ? packFloat128( 1, 0, 0, 0 ) + : packFloat128( 0, 0x3FFF, 0, 0 ); + } + return packFloat128( aSign, 0, 0, 0 ); + } + lastBitMask = 1; + lastBitMask <<= 0x402F - aExp; + roundBitsMask = lastBitMask - 1; + z.low = 0; + z.high = a.high; + roundingMode = float_rounding_mode; + if ( roundingMode == float_round_nearest_even ) { + z.high += lastBitMask>>1; + if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { + z.high &= ~ lastBitMask; + } + } + else if ( roundingMode != float_round_to_zero ) { + if ( extractFloat128Sign( z ) + ^ ( roundingMode == float_round_up ) ) { + z.high |= ( a.low != 0 ); + z.high += roundBitsMask; + } + } + z.high &= ~ roundBitsMask; + } + if ( ( z.low != a.low ) || ( z.high != a.high ) ) { + float_exception_flags |= float_flag_inexact; + } + return z; + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the absolute values of the quadruple-precision +| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated +| before being returned. `zSign' is ignored if the result is a NaN. +| The addition is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float128 addFloat128Sigs( float128 a, float128 b, flag zSign ) +{ + int32 aExp, bExp, zExp; + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + int32 expDiff; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + expDiff = aExp - bExp; + if ( 0 < expDiff ) { + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig0 |= LIT64( 0x0001000000000000 ); + } + shift128ExtraRightJamming( + bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); + zExp = aExp; + } + else if ( expDiff < 0 ) { + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig0 |= LIT64( 0x0001000000000000 ); + } + shift128ExtraRightJamming( + aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); + zExp = bExp; + } + else { + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 | bSig0 | bSig1 ) { + return propagateFloat128NaN( a, b ); + } + return a; + } + add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 ); + zSig2 = 0; + zSig0 |= LIT64( 0x0002000000000000 ); + zExp = aExp; + goto shiftRight1; + } + aSig0 |= LIT64( 0x0001000000000000 ); + add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + --zExp; + if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; + ++zExp; + shiftRight1: + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); + roundAndPack: + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the absolute values of the quadruple- +| precision floating-point values `a' and `b'. If `zSign' is 1, the +| difference is negated before being returned. `zSign' is ignored if the +| result is a NaN. The subtraction is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +static float128 subFloat128Sigs( float128 a, float128 b, flag zSign ) +{ + int32 aExp, bExp, zExp; + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; + int32 expDiff; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + expDiff = aExp - bExp; + shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); + shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); + if ( 0 < expDiff ) goto aExpBigger; + if ( expDiff < 0 ) goto bExpBigger; + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 | bSig0 | bSig1 ) { + return propagateFloat128NaN( a, b ); + } + float_raise( float_flag_invalid ); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + if ( aExp == 0 ) { + aExp = 1; + bExp = 1; + } + if ( bSig0 < aSig0 ) goto aBigger; + if ( aSig0 < bSig0 ) goto bBigger; + if ( bSig1 < aSig1 ) goto aBigger; + if ( aSig1 < bSig1 ) goto bBigger; + return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 ); + bExpBigger: + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); + return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + ++expDiff; + } + else { + aSig0 |= LIT64( 0x4000000000000000 ); + } + shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); + bSig0 |= LIT64( 0x4000000000000000 ); + bBigger: + sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); + zExp = bExp; + zSign ^= 1; + goto normalizeRoundAndPack; + aExpBigger: + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + --expDiff; + } + else { + bSig0 |= LIT64( 0x4000000000000000 ); + } + shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); + aSig0 |= LIT64( 0x4000000000000000 ); + aBigger: + sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); + zExp = aExp; + normalizeRoundAndPack: + --zExp; + return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of adding the quadruple-precision floating-point values +| `a' and `b'. The operation is performed according to the IEC/IEEE Standard +| for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_add( float128 a, float128 b ) +{ + flag aSign, bSign; + + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign == bSign ) { + return addFloat128Sigs( a, b, aSign ); + } + else { + return subFloat128Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of subtracting the quadruple-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_sub( float128 a, float128 b ) +{ + flag aSign, bSign; + + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign == bSign ) { + return subFloat128Sigs( a, b, aSign ); + } + else { + return addFloat128Sigs( a, b, aSign ); + } + +} + +/*---------------------------------------------------------------------------- +| Returns the result of multiplying the quadruple-precision floating-point +| values `a' and `b'. The operation is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_mul( float128 a, float128 b ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + bSign = extractFloat128Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( ( aSig0 | aSig1 ) + || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { + return propagateFloat128NaN( a, b ); + } + if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); + if ( ( aExp | aSig0 | aSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + zExp = aExp + bExp - 0x4000; + aSig0 |= LIT64( 0x0001000000000000 ); + shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); + mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); + add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); + zSig2 |= ( zSig3 != 0 ); + if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); + ++zExp; + } + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the result of dividing the quadruple-precision floating-point value +| `a' by the corresponding value `b'. The operation is performed according to +| the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_div( float128 a, float128 b ) +{ + flag aSign, bSign, zSign; + int32 aExp, bExp, zExp; + bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); + bSign = extractFloat128Sign( b ); + zSign = aSign ^ bSign; + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b ); + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); + goto invalid; + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); + return packFloat128( zSign, 0, 0, 0 ); + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) { + if ( ( aExp | aSig0 | aSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + float_raise( float_flag_divbyzero ); + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + zExp = aExp - bExp + 0x3FFD; + shortShift128Left( + aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); + shortShift128Left( + bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); + if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { + shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); + ++zExp; + } + zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); + mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); + sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); + while ( (sbits64) rem0 < 0 ) { + --zSig0; + add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); + } + zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); + if ( ( zSig1 & 0x3FFF ) <= 4 ) { + mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); + sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); + while ( (sbits64) rem1 < 0 ) { + --zSig1; + add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the remainder of the quadruple-precision floating-point value `a' +| with respect to the corresponding value `b'. The operation is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_rem( float128 a, float128 b ) +{ + flag aSign, zSign; + int32 aExp, bExp, expDiff; + bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; + bits64 allZero, alternateASig0, alternateASig1, sigMean1; + sbits64 sigMean0; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + bSig1 = extractFloat128Frac1( b ); + bSig0 = extractFloat128Frac0( b ); + bExp = extractFloat128Exp( b ); +// bSign = extractFloat128Sign( b ); + if ( aExp == 0x7FFF ) { + if ( ( aSig0 | aSig1 ) + || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { + return propagateFloat128NaN( a, b ); + } + goto invalid; + } + if ( bExp == 0x7FFF ) { + if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b ); + return a; + } + if ( bExp == 0 ) { + if ( ( bSig0 | bSig1 ) == 0 ) { + invalid: + float_raise( float_flag_invalid ); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return a; + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + expDiff = aExp - bExp; + if ( expDiff < -1 ) return a; + shortShift128Left( + aSig0 | LIT64( 0x0001000000000000 ), + aSig1, + 15 - ( expDiff < 0 ), + &aSig0, + &aSig1 + ); + shortShift128Left( + bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); + q = le128( bSig0, bSig1, aSig0, aSig1 ); + if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); + expDiff -= 64; + while ( 0 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig0 ); + q = ( 4 < q ) ? q - 4 : 0; + mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); + shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); + shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); + sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); + expDiff -= 61; + } + if ( -64 < expDiff ) { + q = estimateDiv128To64( aSig0, aSig1, bSig0 ); + q = ( 4 < q ) ? q - 4 : 0; + q >>= - expDiff; + shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); + expDiff += 52; + if ( expDiff < 0 ) { + shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); + } + else { + shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); + } + mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); + sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); + } + else { + shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); + shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); + } + do { + alternateASig0 = aSig0; + alternateASig1 = aSig1; + ++q; + sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); + } while ( 0 <= (sbits64) aSig0 ); + add128( + aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); + if ( ( sigMean0 < 0 ) + || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { + aSig0 = alternateASig0; + aSig1 = alternateASig1; + } + zSign = ( (sbits64) aSig0 < 0 ); + if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); + return + normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Returns the square root of the quadruple-precision floating-point value `a'. +| The operation is performed according to the IEC/IEEE Standard for Binary +| Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +float128 float128_sqrt( float128 a ) +{ + flag aSign; + int32 aExp, zExp; + bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; + bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; + float128 z; + + aSig1 = extractFloat128Frac1( a ); + aSig0 = extractFloat128Frac0( a ); + aExp = extractFloat128Exp( a ); + aSign = extractFloat128Sign( a ); + if ( aExp == 0x7FFF ) { + if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); + if ( ! aSign ) return a; + goto invalid; + } + if ( aSign ) { + if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; + invalid: + float_raise( float_flag_invalid ); + z.low = float128_default_nan_low; + z.high = float128_default_nan_high; + return z; + } + if ( aExp == 0 ) { + if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); + normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); + } + zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; + aSig0 |= LIT64( 0x0001000000000000 ); + zSig0 = estimateSqrt32( aExp, aSig0>>17 ); + shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); + zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); + doubleZSig0 = zSig0<<1; + mul64To128( zSig0, zSig0, &term0, &term1 ); + sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); + while ( (sbits64) rem0 < 0 ) { + --zSig0; + doubleZSig0 -= 2; + add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); + } + zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); + if ( ( zSig1 & 0x1FFF ) <= 5 ) { + if ( zSig1 == 0 ) zSig1 = 1; + mul64To128( doubleZSig0, zSig1, &term1, &term2 ); + sub128( rem1, 0, term1, term2, &rem1, &rem2 ); + mul64To128( zSig1, zSig1, &term2, &term3 ); + sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); + while ( (sbits64) rem1 < 0 ) { + --zSig1; + shortShift128Left( 0, zSig1, 1, &term2, &term3 ); + term3 |= 1; + term2 |= doubleZSig0; + add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); + } + zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); + } + shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); + return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float128_eq( float128 a, float128 b ) +{ + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. The comparison +| is performed according to the IEC/IEEE Standard for Binary Floating-Point +| Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float128_le( float128 a, float128 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. The comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float128_lt( float128 a, float128 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is equal to +| the corresponding value `b', and 0 otherwise. The invalid exception is +| raised if either operand is a NaN. Otherwise, the comparison is performed +| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float128_eq_signaling( float128 a, float128 b ) +{ + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + float_raise( float_flag_invalid ); + return 0; + } + return + ( a.low == b.low ) + && ( ( a.high == b.high ) + || ( ( a.low == 0 ) + && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) + ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not +| cause an exception. Otherwise, the comparison is performed according to the +| IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float128_le_quiet( float128 a, float128 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + == 0 ); + } + return + aSign ? le128( b.high, b.low, a.high, a.low ) + : le128( a.high, a.low, b.high, b.low ); + +} + +/*---------------------------------------------------------------------------- +| Returns 1 if the quadruple-precision floating-point value `a' is less than +| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an +| exception. Otherwise, the comparison is performed according to the IEC/IEEE +| Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +flag float128_lt_quiet( float128 a, float128 b ) +{ + flag aSign, bSign; + + if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) + && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) + || ( ( extractFloat128Exp( b ) == 0x7FFF ) + && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) + ) { + if ( float128_is_signaling_nan( a ) + || float128_is_signaling_nan( b ) ) { + float_raise( float_flag_invalid ); + } + return 0; + } + aSign = extractFloat128Sign( a ); + bSign = extractFloat128Sign( b ); + if ( aSign != bSign ) { + return + aSign + && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) + != 0 ); + } + return + aSign ? lt128( b.high, b.low, a.high, a.low ) + : lt128( a.high, a.low, b.high, b.low ); + +} + +#endif diff --git a/softfloat/softfloat.h b/softfloat/softfloat.h new file mode 100644 index 00000000..cb21f462 --- /dev/null +++ b/softfloat/softfloat.h @@ -0,0 +1,485 @@ + +/*============================================================================ + +This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic +Package, Release 2b. + +Written by John R. Hauser. This work was made possible in part by the +International Computer Science Institute, located at Suite 600, 1947 Center +Street, Berkeley, California 94704. Funding was partially provided by the +National Science Foundation under grant MIP-9311980. The original version +of this code was written as part of a project to build a fixed-point vector +processor in collaboration with the University of California at Berkeley, +overseen by Profs. Nelson Morgan and John Wawrzynek. More information +is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ +arithmetic/SoftFloat.html'. + +THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has +been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES +RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS +AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, +COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE +EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE +INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR +OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. + +Derivative works are acceptable, even for commercial purposes, so long as +(1) the source code for the derivative work includes prominent notice that +the work is derivative, and (2) the source code includes prominent notice with +these four paragraphs for those parts of this code that are retained. + +=============================================================================*/ + +#ifndef SOFTFLOAT_H +#define SOFTFLOAT_H +/*---------------------------------------------------------------------------- +| The macro `FLOATX80' must be defined to enable the extended double-precision +| floating-point format `floatx80'. If this macro is not defined, the +| `floatx80' type will not be defined, and none of the functions that either +| input or output the `floatx80' type will be defined. The same applies to +| the `FLOATX128' macro and the quadruple-precision format `float128'. +*----------------------------------------------------------------------------*/ +#define FLOATX80 +#define FLOATX128 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point types. +*----------------------------------------------------------------------------*/ + +#include "mamesf.h" + +typedef bits32 float32; +typedef bits64 float64; +#ifdef FLOATX80 +typedef struct { + bits16 high; + bits64 low; +} floatx80; +#endif +#ifdef FLOATX128 +typedef struct { + bits64 high, low; +} float128; +#endif + +/*---------------------------------------------------------------------------- +| Primitive arithmetic functions, including multi-word arithmetic, and +| division and square root approximations. (Can be specialized to target if +| desired.) +*----------------------------------------------------------------------------*/ +#include "softfloat-macros.h" + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point underflow tininess-detection mode. +*----------------------------------------------------------------------------*/ +extern int8 float_detect_tininess; +enum { + float_tininess_after_rounding = 0, + float_tininess_before_rounding = 1 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point rounding mode. +*----------------------------------------------------------------------------*/ +extern int8 float_rounding_mode; +enum { + float_round_nearest_even = 0, + float_round_to_zero = 1, + float_round_down = 2, + float_round_up = 3 +}; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE floating-point exception flags. +*----------------------------------------------------------------------------*/ +extern int8 float_exception_flags; +enum { + float_flag_invalid = 0x01, float_flag_denormal = 0x02, float_flag_divbyzero = 0x04, float_flag_overflow = 0x08, + float_flag_underflow = 0x10, float_flag_inexact = 0x20 +}; + +/*---------------------------------------------------------------------------- +| Routine to raise any or all of the software IEC/IEEE floating-point +| exception flags. +*----------------------------------------------------------------------------*/ +void float_raise( int8 ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE integer-to-floating-point conversion routines. +*----------------------------------------------------------------------------*/ +float32 int32_to_float32( int32 ); +float64 int32_to_float64( int32 ); +#ifdef FLOATX80 +floatx80 int32_to_floatx80( int32 ); +#endif +#ifdef FLOATX128 +float128 int32_to_float128( int32 ); +#endif +float32 int64_to_float32( int64 ); +float64 int64_to_float64( int64 ); +#ifdef FLOATX80 +floatx80 int64_to_floatx80( int64 ); +#endif +#ifdef FLOATX128 +float128 int64_to_float128( int64 ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 float32_to_int32( float32 ); +int32 float32_to_int32_round_to_zero( float32 ); +int64 float32_to_int64( float32 ); +int64 float32_to_int64_round_to_zero( float32 ); +float64 float32_to_float64( float32 ); +#ifdef FLOATX80 +floatx80 float32_to_floatx80( float32 ); +floatx80 float32_to_floatx80_allowunnormal( float32 ); +#endif +#ifdef FLOATX128 +float128 float32_to_float128( float32 ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE single-precision operations. +*----------------------------------------------------------------------------*/ +float32 float32_round_to_int( float32 ); +float32 float32_add( float32, float32 ); +float32 float32_sub( float32, float32 ); +float32 float32_mul( float32, float32 ); +float32 float32_div( float32, float32 ); +float32 float32_rem( float32, float32 ); +float32 float32_sqrt( float32 ); +flag float32_eq( float32, float32 ); +flag float32_le( float32, float32 ); +flag float32_lt( float32, float32 ); +flag float32_eq_signaling( float32, float32 ); +flag float32_le_quiet( float32, float32 ); +flag float32_lt_quiet( float32, float32 ); +flag float32_is_signaling_nan( float32 ); + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 float64_to_int32( float64 ); +int32 float64_to_int32_round_to_zero( float64 ); +int64 float64_to_int64( float64 ); +int64 float64_to_int64_round_to_zero( float64 ); +float32 float64_to_float32( float64 ); +#ifdef FLOATX80 +floatx80 float64_to_floatx80( float64 ); +floatx80 float64_to_floatx80_allowunnormal( float64 ); +#endif +#ifdef FLOATX128 +float128 float64_to_float128( float64 ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE double-precision operations. +*----------------------------------------------------------------------------*/ +float64 float64_round_to_int( float64 ); +float64 float64_add( float64, float64 ); +float64 float64_sub( float64, float64 ); +float64 float64_mul( float64, float64 ); +float64 float64_div( float64, float64 ); +float64 float64_rem( float64, float64 ); +float64 float64_sqrt( float64 ); +flag float64_eq( float64, float64 ); +flag float64_le( float64, float64 ); +flag float64_lt( float64, float64 ); +flag float64_eq_signaling( float64, float64 ); +flag float64_le_quiet( float64, float64 ); +flag float64_lt_quiet( float64, float64 ); +flag float64_is_signaling_nan( float64 ); + +#ifdef FLOATX80 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 floatx80_to_int32( floatx80 ); +int32 floatx80_to_int32_round_to_zero( floatx80 ); +int64 floatx80_to_int64( floatx80 ); +int64 floatx80_to_int64_round_to_zero( floatx80 ); +float32 floatx80_to_float32( floatx80 ); +float64 floatx80_to_float64( floatx80 ); +#ifdef FLOATX128 +float128 floatx80_to_float128( floatx80 ); +#endif +floatx80 floatx80_scale(floatx80 a, floatx80 b); +bits64 extractFloatx80Frac( floatx80 a ); +int32 extractFloatx80Exp( floatx80 a ); + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an +| extended double-precision floating-point value, returning the result. +*----------------------------------------------------------------------------*/ + +INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) +{ + floatx80 z; + + z.low = zSig; + z.high = ( ( (bits16) zSign )<<15 ) + zExp; + return z; + +} + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision rounding precision. Valid +| values are 32, 64, and 80. +*----------------------------------------------------------------------------*/ +extern int8 floatx80_rounding_precision; + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE extended double-precision operations. +*----------------------------------------------------------------------------*/ +floatx80 floatx80_round_to_int( floatx80 ); +floatx80 floatx80_round32( floatx80 ); +floatx80 floatx80_normalize( floatx80 ); +floatx80 floatx80_add( floatx80, floatx80 ); +floatx80 floatx80_sub( floatx80, floatx80 ); +floatx80 floatx80_mul( floatx80, floatx80 ); +floatx80 floatx80_div( floatx80, floatx80 ); +floatx80 floatx80_rem( floatx80, floatx80 ); +floatx80 floatx80_sqrt( floatx80 ); +flag floatx80_eq( floatx80, floatx80 ); +flag floatx80_le( floatx80, floatx80 ); +flag floatx80_lt( floatx80, floatx80 ); +flag floatx80_eq_signaling( floatx80, floatx80 ); +flag floatx80_le_quiet( floatx80, floatx80 ); +flag floatx80_lt_quiet( floatx80, floatx80 ); + +flag floatx80_is_signaling_nan( floatx80 ); +flag floatx80_is_nan( floatx80 ); +flag floatx80_is_zero( floatx80 ); +flag floatx80_is_infinity( floatx80 ); +flag floatx80_is_negative( floatx80 ); +flag floatx80_is_denormal( floatx80 ); +flag floatx80_is_unnormal( floatx80 ); +flag floatx80_is_normal( floatx80 ); + +int floatx80_fsincos(floatx80 a, floatx80 *sin_a, floatx80 *cos_a); +int floatx80_fsin(floatx80 *a); +int floatx80_fcos(floatx80 *a); +int floatx80_ftan(floatx80 *a); + +floatx80 floatx80_flognp1(floatx80 a); +floatx80 floatx80_flogn(floatx80 a); +floatx80 floatx80_flog2(floatx80 a); +floatx80 floatx80_flog10(floatx80 a); + +floatx80 floatx80_getman( floatx80 a ); +floatx80 floatx80_getexp( floatx80 a ); + +// roundAndPackFloatx80 used to be in softfloat-round-pack, is now in softfloat.c +floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1); + +#endif + +#ifdef FLOATX128 + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision conversion routines. +*----------------------------------------------------------------------------*/ +int32 float128_to_int32( float128 ); +int32 float128_to_int32_round_to_zero( float128 ); +int64 float128_to_int64( float128 ); +int64 float128_to_int64_round_to_zero( float128 ); +float32 float128_to_float32( float128 ); +float64 float128_to_float64( float128 ); +#ifdef FLOATX80 +floatx80 float128_to_floatx80( float128 ); +#endif + +/*---------------------------------------------------------------------------- +| Software IEC/IEEE quadruple-precision operations. +*----------------------------------------------------------------------------*/ +float128 float128_round_to_int( float128 ); +float128 float128_add( float128, float128 ); +float128 float128_sub( float128, float128 ); +float128 float128_mul( float128, float128 ); +float128 float128_div( float128, float128 ); +float128 float128_rem( float128, float128 ); +float128 float128_sqrt( float128 ); +flag float128_eq( float128, float128 ); +flag float128_le( float128, float128 ); +flag float128_lt( float128, float128 ); +flag float128_eq_signaling( float128, float128 ); +flag float128_le_quiet( float128, float128 ); +flag float128_lt_quiet( float128, float128 ); +flag float128_is_signaling_nan( float128 ); + +/*---------------------------------------------------------------------------- +| Packs the sign `zSign', the exponent `zExp', and the significand formed +| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision +| floating-point value, returning the result. After being shifted into the +| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply +| added together to form the most significant 32 bits of the result. This +| means that any integer portion of `zSig0' will be added into the exponent. +| Since a properly normalized significand will have an integer portion equal +| to 1, the `zExp' input should be 1 less than the desired result exponent +| whenever `zSig0' and `zSig1' concatenated form a complete, normalized +| significand. +*----------------------------------------------------------------------------*/ + +INLINE float128 + packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) +{ + float128 z; + + z.low = zSig1; + z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; + return z; + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and extended significand formed by the concatenation of `zSig0', `zSig1', +| and `zSig2', and returns the proper quadruple-precision floating-point value +| corresponding to the abstract input. Ordinarily, the abstract value is +| simply rounded and packed into the quadruple-precision format, with the +| inexact exception raised if the abstract input cannot be represented +| exactly. However, if the abstract value is too large, the overflow and +| inexact exceptions are raised and an infinity or maximal finite value is +| returned. If the abstract value is too small, the input value is rounded to +| a subnormal number, and the underflow and inexact exceptions are raised if +| the abstract input cannot be represented exactly as a subnormal quadruple- +| precision floating-point number. +| The input significand must be normalized or smaller. If the input +| significand is not normalized, `zExp' must be 0; in that case, the result +| returned is a subnormal number, and it must not require rounding. In the +| usual case that the input significand is normalized, `zExp' must be 1 less +| than the ``true'' floating-point exponent. The handling of underflow and +| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. +*----------------------------------------------------------------------------*/ + +INLINE float128 + roundAndPackFloat128( + flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 ) +{ + int8 roundingMode; + flag roundNearestEven, increment, isTiny; + + roundingMode = float_rounding_mode; + roundNearestEven = ( roundingMode == float_round_nearest_even ); + increment = ( (sbits64) zSig2 < 0 ); + if ( ! roundNearestEven ) { + if ( roundingMode == float_round_to_zero ) { + increment = 0; + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig2; + } + else { + increment = ( roundingMode == float_round_up ) && zSig2; + } + } + } + if ( 0x7FFD <= (bits32) zExp ) { + if ( ( 0x7FFD < zExp ) + || ( ( zExp == 0x7FFD ) + && eq128( + LIT64( 0x0001FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ), + zSig0, + zSig1 + ) + && increment + ) + ) { + float_raise( float_flag_overflow | float_flag_inexact ); + if ( ( roundingMode == float_round_to_zero ) + || ( zSign && ( roundingMode == float_round_up ) ) + || ( ! zSign && ( roundingMode == float_round_down ) ) + ) { + return + packFloat128( + zSign, + 0x7FFE, + LIT64( 0x0000FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ) + ); + } + return packFloat128( zSign, 0x7FFF, 0, 0 ); + } + if ( zExp < 0 ) { + isTiny = + ( float_detect_tininess == float_tininess_before_rounding ) + || ( zExp < -1 ) + || ! increment + || lt128( + zSig0, + zSig1, + LIT64( 0x0001FFFFFFFFFFFF ), + LIT64( 0xFFFFFFFFFFFFFFFF ) + ); + shift128ExtraRightJamming( + zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); + zExp = 0; + if ( isTiny && zSig2 ) float_raise( float_flag_underflow ); + if ( roundNearestEven ) { + increment = ( (sbits64) zSig2 < 0 ); + } + else { + if ( zSign ) { + increment = ( roundingMode == float_round_down ) && zSig2; + } + else { + increment = ( roundingMode == float_round_up ) && zSig2; + } + } + } + } + if ( zSig2 ) float_exception_flags |= float_flag_inexact; + if ( increment ) { + add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); + zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); + } + else { + if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; + } + return packFloat128( zSign, zExp, zSig0, zSig1 ); + +} + +/*---------------------------------------------------------------------------- +| Takes an abstract floating-point value having sign `zSign', exponent `zExp', +| and significand formed by the concatenation of `zSig0' and `zSig1', and +| returns the proper quadruple-precision floating-point value corresponding +| to the abstract input. This routine is just like `roundAndPackFloat128' +| except that the input significand has fewer bits and does not have to be +| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- +| point exponent. +*----------------------------------------------------------------------------*/ + +INLINE float128 + normalizeRoundAndPackFloat128( + flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) +{ + int8 shiftCount; + bits64 zSig2; + + if ( zSig0 == 0 ) { + zSig0 = zSig1; + zSig1 = 0; + zExp -= 64; + } + shiftCount = countLeadingZeros64( zSig0 ) - 15; + if ( 0 <= shiftCount ) { + zSig2 = 0; + shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); + } + else { + shift128ExtraRightJamming( + zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); + } + zExp -= shiftCount; + return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); + +} +#endif + +#endif //SOFTFLOAT_H \ No newline at end of file -- 2.47.3