From a147a1da5c268e9d556c207be0d3da0a519b2d54 Mon Sep 17 00:00:00 2001 From: Serban Constantinescu Date: Sun, 8 Jun 2014 16:55:22 +0100 Subject: [PATCH] AArch64: libm: Fix ARM64 fenv_t and refactor ARM64 libm implementation. This patch fixes the ARM64 ABI for libm. fenv_t is now split in 32bit status and 32bit control. This mirrors the AArch64 FPU control and status registers (FPCR, FPSR). The patch also refactors the libm implementation for ARM64 into a finer grained control over the FPU registers. Bionic-benchmarks has been expanded with 3 more benchmarks for floating point operations. The new libm implementation for ARM64 performs better over all the math benchmarks available. Change-Id: I2a7f81d6b4e55c91f8a63a4c69614fc8b1bcf2db Signed-off-by: Serban Constantinescu --- benchmarks/math_benchmark.cpp | 40 ++++++ libm/arm64/fenv.c | 198 +++++++++++++++++++----------- libm/include/arm64/machine/fenv.h | 107 +++++++--------- 3 files changed, 210 insertions(+), 135 deletions(-) diff --git a/benchmarks/math_benchmark.cpp b/benchmarks/math_benchmark.cpp index 3602de43b..a9748cd63 100644 --- a/benchmarks/math_benchmark.cpp +++ b/benchmarks/math_benchmark.cpp @@ -16,6 +16,7 @@ #include "benchmark.h" +#include <fenv.h> #include <math.h> // Avoid optimization. 
@@ -113,10 +114,49 @@ static void BM_math_isinf_ZERO(int iters) { } BENCHMARK(BM_math_isinf_ZERO); +static void BM_math_sin_fast(int iters) { + StartBenchmarkTiming(); + d = 1.0; + for (int i = 0; i < iters; ++i) { + d += sin(d); + } + StopBenchmarkTiming(); +} +BENCHMARK(BM_math_sin_fast); +static void BM_math_sin_feupdateenv(int iters) { + StartBenchmarkTiming(); + d = 1.0; + for (int i = 0; i < iters; ++i) { + fenv_t __libc_save_rm; + feholdexcept(&__libc_save_rm); + fesetround(FE_TONEAREST); + d += sin(d); + feupdateenv(&__libc_save_rm); + } + + StopBenchmarkTiming(); +} +BENCHMARK(BM_math_sin_feupdateenv); + +static void BM_math_sin_fesetenv(int iters) { + StartBenchmarkTiming(); + + d = 1.0; + for (int i = 0; i < iters; ++i) { + fenv_t __libc_save_rm; + feholdexcept(&__libc_save_rm); + fesetround(FE_TONEAREST); + d += sin(d); + fesetenv(&__libc_save_rm); + } + + StopBenchmarkTiming(); +} +BENCHMARK(BM_math_sin_fesetenv); static void BM_math_fpclassify_NORMAL(int iters) { StartBenchmarkTiming(); diff --git a/libm/arm64/fenv.c b/libm/arm64/fenv.c index 9db21efd9..ce560a707 100644 --- a/libm/arm64/fenv.c +++ b/libm/arm64/fenv.c @@ -28,114 +28,168 @@ #include -/* - * Hopefully the system ID byte is immutable, so it's valid to use - * this as a default environment. - */ -const fenv_t __fe_dfl_env = 0; +#define FPCR_EXCEPT_SHIFT 8 +#define FPCR_EXCEPT_MASK (FE_ALL_EXCEPT << FPCR_EXCEPT_SHIFT) -int fegetenv(fenv_t* __envp) { - fenv_t _fpcr, _fpsr; - __asm__ __volatile__("mrs %0,fpcr" : "=r" (_fpcr)); - __asm__ __volatile__("mrs %0,fpsr" : "=r" (_fpsr)); - *__envp = (_fpcr | _fpsr); +#define FPCR_RMODE_SHIFT 22 + +const fenv_t __fe_dfl_env = { 0 /* control */, 0 /* status */}; + +typedef __uint32_t fpu_control_t; // FPCR, Floating-point Control Register. +typedef __uint32_t fpu_status_t; // FPSR, Floating-point Status Register. 
+ +#define __get_fpcr(__fpcr) __asm__ __volatile__("mrs %0,fpcr" : "=r" (__fpcr)) +#define __get_fpsr(__fpsr) __asm__ __volatile__("mrs %0,fpsr" : "=r" (__fpsr)) +#define __set_fpcr(__fpcr) __asm__ __volatile__("msr fpcr,%0" : :"ri" (__fpcr)) +#define __set_fpsr(__fpsr) __asm__ __volatile__("msr fpsr,%0" : :"ri" (__fpsr)) + +int fegetenv(fenv_t* envp) { + __get_fpcr(envp->__control); + __get_fpsr(envp->__status); return 0; } -int fesetenv(const fenv_t* __envp) { - fenv_t _fpcr = (*__envp & FPCR_MASK); - fenv_t _fpsr = (*__envp & FPSR_MASK); - __asm__ __volatile__("msr fpcr,%0" : :"ri" (_fpcr)); - __asm__ __volatile__("msr fpsr,%0" : :"ri" (_fpsr)); +int fesetenv(const fenv_t* envp) { + fpu_control_t fpcr; + + __get_fpcr(fpcr); + if (envp->__control != fpcr) { + __set_fpcr(envp->__control); + } + __set_fpsr(envp->__status); return 0; } -int feclearexcept(int __excepts) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - __fpscr &= ~__excepts; - fesetenv(&__fpscr); +int feclearexcept(int excepts) { + fpu_status_t fpsr; + + excepts &= FE_ALL_EXCEPT; + __get_fpsr(fpsr); + fpsr &= ~excepts; + __set_fpsr(fpsr); return 0; } -int fegetexceptflag(fexcept_t* __flagp, int __excepts) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - *__flagp = __fpscr & __excepts; +int fegetexceptflag(fexcept_t* flagp, int excepts) { + fpu_status_t fpsr; + + excepts &= FE_ALL_EXCEPT; + __get_fpsr(fpsr); + *flagp = fpsr & excepts; return 0; } -int fesetexceptflag(const fexcept_t* __flagp, int __excepts) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - __fpscr &= ~__excepts; - __fpscr |= *__flagp & __excepts; - fesetenv(&__fpscr); +int fesetexceptflag(const fexcept_t* flagp, int excepts) { + fpu_status_t fpsr; + + excepts &= FE_ALL_EXCEPT; + __get_fpsr(fpsr); + fpsr &= ~excepts; + fpsr |= *flagp & excepts; + __set_fpsr(fpsr); return 0; } -int feraiseexcept(int __excepts) { - fexcept_t __ex = __excepts; - fesetexceptflag(&__ex, __excepts); +int feraiseexcept(int excepts) { + fexcept_t ex = excepts; + + 
fesetexceptflag(&ex, excepts); return 0; } -int fetestexcept(int __excepts) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - return (__fpscr & __excepts); +int fetestexcept(int excepts) { + fpu_status_t fpsr; + + excepts &= FE_ALL_EXCEPT; + __get_fpsr(fpsr); + return (fpsr & excepts); } int fegetround(void) { - fenv_t _fpscr; - fegetenv(&_fpscr); - return ((_fpscr >> _FPSCR_RMODE_SHIFT) & 0x3); + fpu_control_t fpcr; + + __get_fpcr(fpcr); + return ((fpcr >> FPCR_RMODE_SHIFT) & FE_TOWARDZERO); } -int fesetround(int __round) { - fenv_t _fpscr; - fegetenv(&_fpscr); - _fpscr &= ~(0x3 << _FPSCR_RMODE_SHIFT); - _fpscr |= (__round << _FPSCR_RMODE_SHIFT); - fesetenv(&_fpscr); +int fesetround(int round) { + fpu_control_t fpcr, new_fpcr; + + round &= FE_TOWARDZERO; + __get_fpcr(fpcr); + new_fpcr = fpcr & ~(FE_TOWARDZERO << FPCR_RMODE_SHIFT); + new_fpcr |= (round << FPCR_RMODE_SHIFT); + if (new_fpcr != fpcr) { + __set_fpcr(new_fpcr); + } return 0; } -int feholdexcept(fenv_t* __envp) { - fenv_t __env; - fegetenv(&__env); - *__envp = __env; - __env &= ~(FE_ALL_EXCEPT | _FPSCR_ENABLE_MASK); - fesetenv(&__env); +int feholdexcept(fenv_t* envp) { + fenv_t env; + fpu_status_t fpsr; + fpu_control_t fpcr, new_fpcr; + + __get_fpsr(fpsr); + __get_fpcr(fpcr); + env.__status = fpsr; + env.__control = fpcr; + *envp = env; + + // Set exceptions to untrapped. + new_fpcr = fpcr & ~(FE_ALL_EXCEPT << FPCR_EXCEPT_SHIFT); + if (new_fpcr != fpcr) { + __set_fpcr(new_fpcr); + } + + // Clear all exceptions. + fpsr &= ~FE_ALL_EXCEPT; + __set_fpsr(fpsr); return 0; } -int feupdateenv(const fenv_t* __envp) { - fexcept_t __fpscr; - fegetenv(&__fpscr); - fesetenv(__envp); - feraiseexcept(__fpscr & FE_ALL_EXCEPT); +int feupdateenv(const fenv_t* envp) { + fpu_status_t fpsr; + fpu_control_t fpcr; + + // Set FPU Control register. + __get_fpcr(fpcr); + if (envp->__control != fpcr) { + __set_fpcr(envp->__control); + } + + // Set FPU Status register to status | currently raised exceptions. 
+ __get_fpsr(fpsr); + fpsr = envp->__status | (fpsr & FE_ALL_EXCEPT); + __set_fpsr(fpsr); return 0; } -int feenableexcept(int __mask) { - fenv_t __old_fpscr, __new_fpscr; - fegetenv(&__old_fpscr); - __new_fpscr = __old_fpscr | (__mask & FE_ALL_EXCEPT) << _FPSCR_ENABLE_SHIFT; - fesetenv(&__new_fpscr); - return ((__old_fpscr >> _FPSCR_ENABLE_SHIFT) & FE_ALL_EXCEPT); +int feenableexcept(int mask) { + fpu_control_t old_fpcr, new_fpcr; + + __get_fpcr(old_fpcr); + new_fpcr = old_fpcr | ((mask & FE_ALL_EXCEPT) << FPCR_EXCEPT_SHIFT); + if (new_fpcr != old_fpcr) { + __set_fpcr(new_fpcr); + } + return ((old_fpcr >> FPCR_EXCEPT_SHIFT) & FE_ALL_EXCEPT); } -int fedisableexcept(int __mask) { - fenv_t __old_fpscr, __new_fpscr; - fegetenv(&__old_fpscr); - __new_fpscr = __old_fpscr & ~((__mask & FE_ALL_EXCEPT) << _FPSCR_ENABLE_SHIFT); - fesetenv(&__new_fpscr); - return ((__old_fpscr >> _FPSCR_ENABLE_SHIFT) & FE_ALL_EXCEPT); +int fedisableexcept(int mask) { + fpu_control_t old_fpcr, new_fpcr; + + __get_fpcr(old_fpcr); + new_fpcr = old_fpcr & ~((mask & FE_ALL_EXCEPT) << FPCR_EXCEPT_SHIFT); + if (new_fpcr != old_fpcr) { + __set_fpcr(new_fpcr); + } + return ((old_fpcr >> FPCR_EXCEPT_SHIFT) & FE_ALL_EXCEPT); } int fegetexcept(void) { - fenv_t __fpscr; - fegetenv(&__fpscr); - return ((__fpscr & _FPSCR_ENABLE_MASK) >> _FPSCR_ENABLE_SHIFT); + fpu_control_t fpcr; + + __get_fpcr(fpcr); + return ((fpcr & FPCR_EXCEPT_MASK) >> FPCR_EXCEPT_SHIFT); } diff --git a/libm/include/arm64/machine/fenv.h b/libm/include/arm64/machine/fenv.h index 2efeee3da..a8568b854 100644 --- a/libm/include/arm64/machine/fenv.h +++ b/libm/include/arm64/machine/fenv.h @@ -27,15 +27,44 @@ */ /* - * Rewritten for Android. 
+ * In ARMv8, AArch64 state, floating-point operation is controlled by: * - * The ARM FPSCR (Floating-point Status and Control Register) described here: - * http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0344b/Chdfafia.html - * has been split into the FPCR (Floating-point Control Register) and FPSR - * (Floating-point Status Register) on the ARMv8. These are described briefly in - * "Procedure Call Standard for the ARM 64-bit Architecture" - * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055a/IHI0055A_aapcs64.pdf - * section 5.1.2 SIMD and Floating-Point Registers + * * FPCR - 32Bit Floating-Point Control Register: + * * [31:27] - Reserved, Res0; + * * [26] - AHP, Alternative half-precision control bit; + * * [25] - DN, Default NaN mode control bit; + * * [24] - FZ, Flush-to-zero mode control bit; + * * [23:22] - RMode, Rounding Mode control field: + * * 00 - Round to Nearest (RN) mode; + * * 01 - Round towards Plus Infinity (RP) mode; + * * 10 - Round towards Minus Infinity (RM) mode; + * * 11 - Round towards Zero (RZ) mode. + * * [21:20] - Stride, ignored during AArch64 execution; + * * [19] - Reserved, Res0; + * * [18:16] - Len, ignored during AArch64 execution; + * * [15] - IDE, Input Denormal exception trap; + * * [14:13] - Reserved, Res0; + * * [12] - IXE, Inexact exception trap; + * * [11] - UFE, Underflow exception trap; + * * [10] - OFE, Overflow exception trap; + * * [9] - DZE, Division by Zero exception; + * * [8] - IOE, Invalid Operation exception; + * * [7:0] - Reserved, Res0. 
+ * + * * FPSR - 32Bit Floating-Point Status Register: + * * [31] - N, Negative condition flag for AArch32 (AArch64 sets PSTATE.N); + * * [30] - Z, Zero condition flag for AArch32 (AArch64 sets PSTATE.Z); + * * [29] - C, Carry condition flag for AArch32 (AArch64 sets PSTATE.C); + * * [28] - V, Overflow condition flag for AArch32 (AArch64 sets PSTATE.V); + * * [27] - QC, Cumulative saturation bit, Advanced SIMD only; + * * [26:8] - Reserved, Res0; + * * [7] - IDC, Input Denormal cumulative exception; + * * [6:5] - Reserved, Res0; + * * [4] - IXC, Inexact cumulative exception; + * * [3] - UFC, Underflow cumulative exception; + * * [2] - OFC, Overflow cumulative exception; + * * [1] - DZC, Division by Zero cumulative exception; + * * [0] - IOC, Invalid Operation cumulative exception. */ #ifndef _ARM64_FENV_H_ @@ -45,7 +74,11 @@ __BEGIN_DECLS -typedef __uint32_t fenv_t; +typedef struct { + __uint32_t __control; /* FPCR, Floating-point Control Register */ + __uint32_t __status; /* FPSR, Floating-point Status Register */ +} fenv_t; + typedef __uint32_t fexcept_t; /* Exception flags. */ @@ -54,11 +87,9 @@ typedef __uint32_t fexcept_t; #define FE_OVERFLOW 0x04 #define FE_UNDERFLOW 0x08 #define FE_INEXACT 0x10 +#define FE_DENORMAL 0x80 #define FE_ALL_EXCEPT (FE_DIVBYZERO | FE_INEXACT | FE_INVALID | \ - FE_OVERFLOW | FE_UNDERFLOW) - -#define _FPSCR_ENABLE_SHIFT 8 -#define _FPSCR_ENABLE_MASK (FE_ALL_EXCEPT << _FPSCR_ENABLE_SHIFT) /* Rounding modes. 
*/ #define FE_TONEAREST 0x0 @@ -66,56 +97,6 @@ typedef __uint32_t fexcept_t; #define FE_DOWNWARD 0x2 #define FE_TOWARDZERO 0x3 -#define _FPSCR_RMODE_SHIFT 22 - -#define FPCR_IOE (1 << 8) -#define FPCR_DZE (1 << 9) -#define FPCR_OFE (1 << 10) -#define FPCR_UFE (1 << 11) -#define FPCR_IXE (1 << 12) -#define FPCR_IDE (1 << 15) -#define FPCR_LEN (7 << 16) -#define FPCR_STRIDE (3 << 20) -#define FPCR_RMODE (3 << 22) -#define FPCR_FZ (1 << 24) -#define FPCR_DN (1 << 25) -#define FPCR_AHP (1 << 26) -#define FPCR_MASK (FPCR_IOE | \ - FPCR_DZE | \ - FPCR_OFE | \ - FPCR_UFE | \ - FPCR_IXE | \ - FPCR_IDE | \ - FPCR_LEN | \ - FPCR_STRIDE | \ - FPCR_RMODE | \ - FPCR_FZ | \ - FPCR_DN | \ - FPCR_AHP ) - -#define FPSR_IOC (1 << 0) -#define FPSR_DZC (1 << 1) -#define FPSR_OFC (1 << 2) -#define FPSR_UFC (1 << 3) -#define FPSR_IXC (1 << 4) -#define FPSR_IDC (1 << 7) -#define FPSR_QC (1 << 27) -#define FPSR_V (1 << 28) -#define FPSR_C (1 << 29) -#define FPSR_Z (1 << 30) -#define FPSR_N (1 << 31) -#define FPSR_MASK (FPSR_IOC | \ - FPSR_DZC | \ - FPSR_OFC | \ - FPSR_UFC | \ - FPSR_IXC | \ - FPSR_IDC | \ - FPSR_QC | \ - FPSR_V | \ - FPSR_C | \ - FPSR_Z | \ - FPSR_N ) - __END_DECLS #endif /* !_ARM64_FENV_H_ */