diff --git a/benchmarks/math_benchmark.cpp b/benchmarks/math_benchmark.cpp index 3602de43b..a9748cd63 100644 --- a/benchmarks/math_benchmark.cpp +++ b/benchmarks/math_benchmark.cpp @@ -16,6 +16,7 @@ #include "benchmark.h" +#include #include // Avoid optimization. @@ -113,10 +114,49 @@ static void BM_math_isinf_ZERO(int iters) { } BENCHMARK(BM_math_isinf_ZERO); +static void BM_math_sin_fast(int iters) { + StartBenchmarkTiming(); + d = 1.0; + for (int i = 0; i < iters; ++i) { + d += sin(d); + } + StopBenchmarkTiming(); +} +BENCHMARK(BM_math_sin_fast); +static void BM_math_sin_feupdateenv(int iters) { + StartBenchmarkTiming(); + d = 1.0; + for (int i = 0; i < iters; ++i) { + fenv_t __libc_save_rm; + feholdexcept(&__libc_save_rm); + fesetround(FE_TONEAREST); + d += sin(d); + feupdateenv(&__libc_save_rm); + } + + StopBenchmarkTiming(); +} +BENCHMARK(BM_math_sin_feupdateenv); + +static void BM_math_sin_fesetenv(int iters) { + StartBenchmarkTiming(); + + d = 1.0; + for (int i = 0; i < iters; ++i) { + fenv_t __libc_save_rm; + feholdexcept(&__libc_save_rm); + fesetround(FE_TONEAREST); + d += sin(d); + fesetenv(&__libc_save_rm); + } + + StopBenchmarkTiming(); +} +BENCHMARK(BM_math_sin_fesetenv); static void BM_math_fpclassify_NORMAL(int iters) { StartBenchmarkTiming(); diff --git a/libm/arm64/fenv.c b/libm/arm64/fenv.c index 9db21efd9..ce560a707 100644 --- a/libm/arm64/fenv.c +++ b/libm/arm64/fenv.c @@ -28,114 +28,168 @@ #include -/* - * Hopefully the system ID byte is immutable, so it's valid to use - * this as a default environment. 
- */ -const fenv_t __fe_dfl_env = 0; +#define FPCR_EXCEPT_SHIFT 8 +#define FPCR_EXCEPT_MASK (FE_ALL_EXCEPT << FPCR_EXCEPT_SHIFT) -int fegetenv(fenv_t* __envp) { - fenv_t _fpcr, _fpsr; - __asm__ __volatile__("mrs %0,fpcr" : "=r" (_fpcr)); - __asm__ __volatile__("mrs %0,fpsr" : "=r" (_fpsr)); - *__envp = (_fpcr | _fpsr); +#define FPCR_RMODE_SHIFT 22 + +const fenv_t __fe_dfl_env = { 0 /* control */, 0 /* status */}; + +typedef __uint32_t fpu_control_t; // FPCR, Floating-point Control Register. +typedef __uint32_t fpu_status_t; // FPSR, Floating-point Status Register. + +#define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r" (__fpcr)) +#define __get_fpsr(__fpsr) __asm__ __volatile__("mrs %0,fpsr" : "=r" (__fpsr)) +#define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : :"ri" (__fpcr)) +#define __set_fpsr(__fpsr) __asm__ __volatile__("msr fpsr,%0" : :"ri" (__fpsr)) + +int fegetenv(fenv_t* envp) { + __get_fpcr(envp->__control); + __get_fpsr(envp->__status); return 0; } -int fesetenv(const fenv_t* __envp) { - fenv_t _fpcr = (*__envp & FPCR_MASK); - fenv_t _fpsr = (*__envp & FPSR_MASK); - __asm__ __volatile__("msr fpcr,%0" : :"ri" (_fpcr)); - __asm__ __volatile__("msr fpsr,%0" : :"ri" (_fpsr)); +int fesetenv(const fenv_t* envp) { + fpu_control_t fpcr; + + __get_fpcr(fpcr); + if (envp->__control != fpcr) { + __set_fpcr(envp->__control); + } + __set_fpsr(envp->__status); return 0; } -int feclearexcept(int __excepts) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - __fpscr &= ~__excepts; - fesetenv(&__fpscr); +int feclearexcept(int excepts) { + fpu_status_t fpsr; + + excepts &= FE_ALL_EXCEPT; + __get_fpsr(fpsr); + fpsr &= ~excepts; + __set_fpsr(fpsr); return 0; } -int fegetexceptflag(fexcept_t* __flagp, int __excepts) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - *__flagp = __fpscr & __excepts; +int fegetexceptflag(fexcept_t* flagp, int excepts) { + fpu_status_t fpsr; + + excepts &= FE_ALL_EXCEPT; + __get_fpsr(fpsr); + *flagp = fpsr & excepts; return 0; } 
-int fesetexceptflag(const fexcept_t* __flagp, int __excepts) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - __fpscr &= ~__excepts; - __fpscr |= *__flagp & __excepts; - fesetenv(&__fpscr); +int fesetexceptflag(const fexcept_t* flagp, int excepts) { + fpu_status_t fpsr; + + excepts &= FE_ALL_EXCEPT; + __get_fpsr(fpsr); + fpsr &= ~excepts; + fpsr |= *flagp & excepts; + __set_fpsr(fpsr); return 0; } -int feraiseexcept(int __excepts) { - fexcept_t __ex = __excepts; - fesetexceptflag(&__ex, __excepts); +int feraiseexcept(int excepts) { + fexcept_t ex = excepts; + + fesetexceptflag(&ex, excepts); return 0; } -int fetestexcept(int __excepts) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - return (__fpscr & __excepts); +int fetestexcept(int excepts) { + fpu_status_t fpsr; + + excepts &= FE_ALL_EXCEPT; + __get_fpsr(fpsr); + return (fpsr & excepts); } int fegetround(void) { - fenv_t _fpscr; - fegetenv(&_fpscr); - return ((_fpscr >> _FPSCR_RMODE_SHIFT) & 0x3); + fpu_control_t fpcr; + + __get_fpcr(fpcr); + return ((fpcr >> FPCR_RMODE_SHIFT) & FE_TOWARDZERO); } -int fesetround(int __round) { - fenv_t _fpscr; - fegetenv(&_fpscr); - _fpscr &= ~(0x3 << _FPSCR_RMODE_SHIFT); - _fpscr |= (__round << _FPSCR_RMODE_SHIFT); - fesetenv(&_fpscr); +int fesetround(int round) { + fpu_control_t fpcr, new_fpcr; + + round &= FE_TOWARDZERO; + __get_fpcr(fpcr); + new_fpcr = fpcr & ~(FE_TOWARDZERO << FPCR_RMODE_SHIFT); + new_fpcr |= (round << FPCR_RMODE_SHIFT); + if (new_fpcr != fpcr) { + __set_fpcr(new_fpcr); + } return 0; } -int feholdexcept(fenv_t* __envp) { - fenv_t __env; - fegetenv(&__env); - *__envp = __env; - __env &= ~(FE_ALL_EXCEPT | _FPSCR_ENABLE_MASK); - fesetenv(&__env); +int feholdexcept(fenv_t* envp) { + fenv_t env; + fpu_status_t fpsr; + fpu_control_t fpcr, new_fpcr; + + __get_fpsr(fpsr); + __get_fpcr(fpcr); + env.__status = fpsr; + env.__control = fpcr; + *envp = env; + + // Set exceptions to untrapped. 
+ new_fpcr = fpcr & ~(FE_ALL_EXCEPT << FPCR_EXCEPT_SHIFT); + if (new_fpcr != fpcr) { + __set_fpcr(new_fpcr); + } + + // Clear all exceptions. + fpsr &= ~FE_ALL_EXCEPT; + __set_fpsr(fpsr); return 0; } -int feupdateenv(const fenv_t* __envp) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - fesetenv(__envp); - feraiseexcept(__fpscr & FE_ALL_EXCEPT); +int feupdateenv(const fenv_t* envp) { + fpu_status_t fpsr; + fpu_control_t fpcr; + + // Set FPU Control register. + __get_fpcr(fpcr); + if (envp->__control != fpcr) { + __set_fpcr(envp->__control); + } + + // Set FPU Status register to status | currently raised exceptions. + __get_fpsr(fpsr); + fpsr = envp->__status | (fpsr & FE_ALL_EXCEPT); + __set_fpsr(fpsr); return 0; } -int feenableexcept(int __mask) { - fenv_t __old_fpscr, __new_fpscr; - fegetenv(&__old_fpscr); - __new_fpscr = __old_fpscr | (__mask & FE_ALL_EXCEPT) << _FPSCR_ENABLE_SHIFT; - fesetenv(&__new_fpscr); - return ((__old_fpscr >> _FPSCR_ENABLE_SHIFT) & FE_ALL_EXCEPT); +int feenableexcept(int mask) { + fpu_control_t old_fpcr, new_fpcr; + + __get_fpcr(old_fpcr); + new_fpcr = old_fpcr | ((mask & FE_ALL_EXCEPT) << FPCR_EXCEPT_SHIFT); + if (new_fpcr != old_fpcr) { + __set_fpcr(new_fpcr); + } + return ((old_fpcr >> FPCR_EXCEPT_SHIFT) & FE_ALL_EXCEPT); } -int fedisableexcept(int __mask) { - fenv_t __old_fpscr, __new_fpscr; - fegetenv(&__old_fpscr); - __new_fpscr = __old_fpscr & ~((__mask & FE_ALL_EXCEPT) << _FPSCR_ENABLE_SHIFT); - fesetenv(&__new_fpscr); - return ((__old_fpscr >> _FPSCR_ENABLE_SHIFT) & FE_ALL_EXCEPT); +int fedisableexcept(int mask) { + fpu_control_t old_fpcr, new_fpcr; + + __get_fpcr(old_fpcr); + new_fpcr = old_fpcr & ~((mask & FE_ALL_EXCEPT) << FPCR_EXCEPT_SHIFT); + if (new_fpcr != old_fpcr) { + __set_fpcr(new_fpcr); + } + return ((old_fpcr >> FPCR_EXCEPT_SHIFT) & FE_ALL_EXCEPT); } int fegetexcept(void) { - fenv_t __fpscr; - fegetenv(&__fpscr); - return ((__fpscr & _FPSCR_ENABLE_MASK) >> _FPSCR_ENABLE_SHIFT); + fpu_control_t fpcr; + + 
__get_fpcr(fpcr); + return ((fpcr & FPCR_EXCEPT_MASK) >> FPCR_EXCEPT_SHIFT); } diff --git a/libm/include/arm64/machine/fenv.h b/libm/include/arm64/machine/fenv.h index 2efeee3da..a8568b854 100644 --- a/libm/include/arm64/machine/fenv.h +++ b/libm/include/arm64/machine/fenv.h @@ -27,15 +27,44 @@ */ /* - * Rewritten for Android. + * In ARMv8, AArch64 state, floating-point operation is controlled by: * - * The ARM FPSCR (Floating-point Status and Control Register) described here: - * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0344b/Chdfafia.html - * has been split into the FPCR (Floating-point Control Register) and FPSR - * (Floating-point Status Register) on the ARMv8. These are described briefly in - * "Procedure Call Standard for the ARM 64-bit Architecture" - * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055a/IHI0055A_aapcs64.pdf - * section 5.1.2 SIMD and Floating-Point Registers + * * FPCR - 32Bit Floating-Point Control Register: + * * [31:27] - Reserved, Res0; + * * [26] - AHP, Alternative half-precision control bit; + * * [25] - DN, Default NaN mode control bit; + * * [24] - FZ, Flush-to-zero mode control bit; + * * [23:22] - RMode, Rounding Mode control field: + * * 00 - Round to Nearest (RN) mode; + * * 01 - Round towards Plus Infinity (RP) mode; + * * 10 - Round towards Minus Infinity (RM) mode; + * * 11 - Round towards Zero (RZ) mode. + * * [21:20] - Stride, ignored during AArch64 execution; + * * [19] - Reserved, Res0; + * * [18:16] - Len, ignored during AArch64 execution; + * * [15] - IDE, Input Denormal exception trap; + * * [14:13] - Reserved, Res0; + * * [12] - IXE, Inexact exception trap; + * * [11] - UFE, Underflow exception trap; + * * [10] - OFE, Overflow exception trap; + * * [9] - DZE, Division by Zero exception; + * * [8] - IOE, Invalid Operation exception; + * * [7:0] - Reserved, Res0. 
+ * + * * FPSR - 32Bit Floating-Point Status Register: + * * [31] - N, Negative condition flag for AArch32 (AArch64 sets PSTATE.N); + * * [30] - Z, Zero condition flag for AArch32 (AArch64 sets PSTATE.Z); + * * [29] - C, Carry condition flag for AArch32 (AArch64 sets PSTATE.C); + * * [28] - V, Overflow condition flag for AArch32 (AArch64 sets PSTATE.V); + * * [27] - QC, Cumulative saturation bit, Advanced SIMD only; + * * [26:8] - Reserved, Res0; + * * [7] - IDC, Input Denormal cumulative exception; + * * [6:5] - Reserved, Res0; + * * [4] - IXC, Inexact cumulative exception; + * * [3] - UFC, Underflow cumulative exception; + * * [2] - OFC, Overflow cumulative exception; + * * [1] - DZC, Division by Zero cumulative exception; + * * [0] - IOC, Invalid Operation cumulative exception. */ #ifndef _ARM64_FENV_H_ @@ -45,7 +74,11 @@ __BEGIN_DECLS -typedef __uint32_t fenv_t; +typedef struct { + __uint32_t __control; /* FPCR, Floating-point Control Register */ + __uint32_t __status; /* FPSR, Floating-point Status Register */ +} fenv_t; + typedef __uint32_t fexcept_t; /* Exception flags. */ @@ -54,11 +87,9 @@ typedef __uint32_t fexcept_t; #define FE_OVERFLOW 0x04 #define FE_UNDERFLOW 0x08 #define FE_INEXACT 0x10 +#define FE_DENORMAL 0x80 #define FE_ALL_EXCEPT (FE_DIVBYZERO | FE_INEXACT | FE_INVALID | \ - FE_OVERFLOW | FE_UNDERFLOW) - -#define _FPSCR_ENABLE_SHIFT 8 -#define _FPSCR_ENABLE_MASK (FE_ALL_EXCEPT << _FPSCR_ENABLE_SHIFT) /* Rounding modes. 
*/ #define FE_TONEAREST 0x0 @@ -66,56 +97,6 @@ typedef __uint32_t fexcept_t; #define FE_DOWNWARD 0x2 #define FE_TOWARDZERO 0x3 -#define _FPSCR_RMODE_SHIFT 22 - -#define FPCR_IOE (1 << 8) -#define FPCR_DZE (1 << 9) -#define FPCR_OFE (1 << 10) -#define FPCR_UFE (1 << 11) -#define FPCR_IXE (1 << 12) -#define FPCR_IDE (1 << 15) -#define FPCR_LEN (7 << 16) -#define FPCR_STRIDE (3 << 20) -#define FPCR_RMODE (3 << 22) -#define FPCR_FZ (1 << 24) -#define FPCR_DN (1 << 25) -#define FPCR_AHP (1 << 26) -#define FPCR_MASK (FPCR_IOE | \ - FPCR_DZE | \ - FPCR_OFE | \ - FPCR_UFE | \ - FPCR_IXE | \ - FPCR_IDE | \ - FPCR_LEN | \ - FPCR_STRIDE | \ - FPCR_RMODE | \ - FPCR_FZ | \ - FPCR_DN | \ - FPCR_AHP ) - -#define FPSR_IOC (1 << 0) -#define FPSR_DZC (1 << 1) -#define FPSR_OFC (1 << 2) -#define FPSR_UFC (1 << 3) -#define FPSR_IXC (1 << 4) -#define FPSR_IDC (1 << 7) -#define FPSR_QC (1 << 27) -#define FPSR_V (1 << 28) -#define FPSR_C (1 << 29) -#define FPSR_Z (1 << 30) -#define FPSR_N (1 << 31) -#define FPSR_MASK (FPSR_IOC | \ - FPSR_DZC | \ - FPSR_OFC | \ - FPSR_UFC | \ - FPSR_IXC | \ - FPSR_IDC | \ - FPSR_QC | \ - FPSR_V | \ - FPSR_C | \ - FPSR_Z | \ - FPSR_N ) - __END_DECLS #endif /* !_ARM64_FENV_H_ */