bionic/libm/x86_64/e_exp.S
Jingwei Zhang 5d4f0e6a26 Add the optimized implementation of 18 math functions for x86 and x86_64 respectively
Change-Id: I31bf601448a9427f825517f3a0ff24de47f49bfa
Signed-off-by: Jingwei Zhang <jingwei.zhang@intel.com>
Signed-off-by: Mingwei Shi <mingwei.shi@intel.com>
2015-03-09 13:19:08 -07:00

637 lines
14 KiB
ArmAsm

/*
Copyright (c) 2014, Intel Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* Neither the name of Intel Corporation nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/******************************************************************************/
// ALGORITHM DESCRIPTION
// ---------------------
//
// Description:
// Let K = 64 (table size).
// e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
// where
// x = m*log(2)/K + y, y in [-log(2)/K..log(2)/K]
// m = n*K + j, m,n,j - signed integer, j in [-K/2..K/2]
// values of 2^(j/K) are tabulated as T[j] = T_hi[j] ( 1 + T_lo[j]).
//
// P(y) is a minimax polynomial approximation of exp(y)-1
// on the small interval [-log(2)/K..log(2)/K] (coefficients were calculated by Maple V).
//
// To avoid problems with arithmetic overflow and underflow,
// value of 2^n is safely computed as 2^n1 * 2^n2 where n1 in [-BIAS/2..BIAS/2]
// where BIAS is a value of exponent bias.
//
// Special cases:
// exp(NaN) = NaN
// exp(+INF) = +INF
// exp(-INF) = 0
// exp(x) = 1 for subnormals
// for finite argument, only exp(0)=1 is exact
// For IEEE double
// if x > 709.782712893383973096 then exp(x) overflow
// if x < -745.133219101941108420 then exp(x) underflow
//
/******************************************************************************/
#include <private/bionic_asm.h>
# -- Begin exp
// double exp(double x)   (x86-64, AT&T syntax, SSE2)
//
// In:   %xmm0 (low lane) = x
// Out:  %xmm0 (low lane) = result
// Uses: %eax, %ecx, %edx, %r8, %xmm0-%xmm7, flags, and a 24-byte stack frame.
//       The routine contains no call instructions.
//
// Frame layout as used below:
//    0(%rsp)  4-byte code (14 or 15) stored on some paths; never read here
//    8(%rsp)  copy of the argument x (low dword at 8, high dword at 12)
//   16(%rsp)  scratch for the store/reload at .L_2TAG_PACKET_6.0.2
//
// The ..___tag_value_exp.N labels are referenced by the hand-written
// .eh_frame data at the end of the file and must stay where they are.
ENTRY(exp)
# parameter 1: %xmm0
..B1.1:
..___tag_value_exp.1:
// Reserve the frame and keep a copy of x for the special-case paths.
subq $24, %rsp
..___tag_value_exp.3:
movsd %xmm0, 8(%rsp)
..B1.2:
// xmm0 = {x, x}; preload the packed constants used by the range reduction.
unpcklpd %xmm0, %xmm0
movapd cv(%rip), %xmm1
movapd Shifter(%rip), %xmm6
movapd 16+cv(%rip), %xmm2
movapd 32+cv(%rip), %xmm3
// Range check on the top 16 bits of x with the sign bit removed (eax).
// edx = 16527 - eax is negative when eax > 16527 (0x408f);
// eax - 15504 is negative when eax < 15504 (0x3c90).
// If either difference has its sign bit set, leave the main path.
pextrw $3, %xmm0, %eax
andl $32767, %eax
movl $16527, %edx
subl %eax, %edx
subl $15504, %eax
orl %eax, %edx
cmpl $-2147483648, %edx
jae .L_2TAG_PACKET_0.0.2
// Main path. Multiply x by cv[0], add and subtract Shifter so the product
// is rounded to an integer m: xmm7 keeps the shifted sum (m in its low
// bits), xmm1 holds m as a double.
mulpd %xmm0, %xmm1
addpd %xmm6, %xmm1
movapd %xmm1, %xmm7
subpd %xmm6, %xmm1
// Reduced argument y = x - m*cv[16] - m*cv[32] (two-part subtraction);
// the polynomial coefficient pairs at cv+64 and cv+80 are loaded meanwhile.
mulpd %xmm1, %xmm2
movapd 64+cv(%rip), %xmm4
mulpd %xmm1, %xmm3
movapd 80+cv(%rip), %xmm5
subpd %xmm2, %xmm0
// Split m: ecx = (m & 63) * 16 is the byte offset into Tbl_addr,
// eax = edx = m >> 6 (arithmetic shift) is the binary exponent n.
movd %xmm7, %eax
movl %eax, %ecx
andl $63, %ecx
shll $4, %ecx
sarl $6, %eax
movl %eax, %edx
// Build the exponent bits in xmm7: clear the low 6 bits of m (mmask),
// add bias (65472 = 1023*64), then shift left by 46 so that n + 1023
// lands in the exponent field starting at bit 52.
movdqa mmask(%rip), %xmm6
pand %xmm6, %xmm7
movdqa bias(%rip), %xmm6
paddq %xmm6, %xmm7
psllq $46, %xmm7
subpd %xmm3, %xmm0
// Load the 16-byte table entry for j = m & 63 into xmm2.
lea Tbl_addr(%rip), %r8
movapd (%rcx,%r8), %xmm2
// Polynomial evaluation in y, two lanes at a time. xmm1 keeps y,
// xmm6 = y^2, xmm0 collects higher powers of y times the coefficient
// pairs; cv+48 multiplies y^2.
mulpd %xmm0, %xmm4
movapd %xmm0, %xmm6
movapd %xmm0, %xmm1
mulpd %xmm6, %xmm6
mulpd %xmm6, %xmm0
addpd %xmm4, %xmm5
mulsd %xmm6, %xmm0
mulpd 48+cv(%rip), %xmm6
// Low quad of the table entry is added to y; the high quad is broadcast
// and OR-ed with the exponent bits in xmm7 to form the scale factor xmm2.
addsd %xmm2, %xmm1
unpckhpd %xmm2, %xmm2
mulpd %xmm5, %xmm0
addsd %xmm0, %xmm1
orpd %xmm7, %xmm2
// Fold the high-lane partial sum into the low lane and add the y^2 term.
unpckhpd %xmm0, %xmm0
addsd %xmm1, %xmm0
addsd %xmm6, %xmm0
// Unsigned test of n + 894 against 1916: taken when n < -894 or n > 1022.
addl $894, %edx
cmpl $1916, %edx
ja .L_2TAG_PACKET_1.0.2
// Fast path: result = xmm2 + xmm2 * xmm0.
mulsd %xmm2, %xmm0
addsd %xmm2, %xmm0
jmp ..B1.5
.L_2TAG_PACKET_1.0.2:
// n is outside the fast range. The scale factor is applied in two steps:
// the exponent of xmm2 is lowered by n >> 1 here, and xmm3 (built below as
// a double with exponent field (n >> 1) + bias) is multiplied in at the end.
// xmm4 = ALLONES shifted left by (-1022 - n) bits per quad, used as a mask.
xorpd %xmm3, %xmm3
movapd ALLONES(%rip), %xmm4
movl $-1022, %edx
subl %eax, %edx
movd %edx, %xmm5
psllq %xmm5, %xmm4
// ecx = n; eax = n >> 1, inserted into word 3 of xmm3 and shifted by 4
// so it lines up with the exponent field, then subtracted from xmm2.
movl %eax, %ecx
sarl $1, %eax
pinsrw $3, %eax, %xmm3
movapd ebias(%rip), %xmm6
psllq $4, %xmm3
psubd %xmm3, %xmm2
mulsd %xmm2, %xmm0
// edx = -1022 - n; more than 52 goes to the path at .L_2TAG_PACKET_2.0.2.
cmpl $52, %edx
jg .L_2TAG_PACKET_2.0.2
// Split xmm2 into a masked head (xmm4) and the remainder (xmm2), add the
// remainder to the product; xmm3 gets ebias added to become the final scale.
andpd %xmm2, %xmm4
paddd %xmm6, %xmm3
subsd %xmm4, %xmm2
addsd %xmm2, %xmm0
// n >= 1023: finish on the overflow-checking path.
cmpl $1023, %ecx
jge .L_2TAG_PACKET_3.0.2
// Combine the sign bit of the partial sum with edx; if both are zero,
// finish directly at .L_2TAG_PACKET_4.0.2.
pextrw $3, %xmm0, %ecx
andl $32768, %ecx
orl %ecx, %edx
cmpl $0, %edx
je .L_2TAG_PACKET_4.0.2
// Keep the partial sum in xmm6, then compute (sum + head) * scale.
movapd %xmm0, %xmm6
addsd %xmm4, %xmm0
mulsd %xmm3, %xmm0
// If the exponent field of the result is zero, redo the final step with
// integer arithmetic at .L_2TAG_PACKET_5.0.2; otherwise return.
pextrw $3, %xmm0, %ecx
andl $32752, %ecx
cmpl $0, %ecx
je .L_2TAG_PACKET_5.0.2
jmp ..B1.5
.L_2TAG_PACKET_5.0.2:
// Scale the saved partial sum (xmm6) and the head (xmm4) separately, then
// combine their bit patterns with integer operations:
//   xmm6 becomes an all-ones/all-zeros mask from the sign of (sum XOR head),
//   xmm0 = bits of the scaled sum with the sign bit cleared, complemented
//   and incremented when the mask is set, then added to the head bits.
// NOTE(review): presumably this avoids a second rounding for results with a
// zero exponent field - confirm against the original Intel sources.
mulsd %xmm3, %xmm6
mulsd %xmm3, %xmm4
movq %xmm6, %xmm0
pxor %xmm4, %xmm6
psrad $31, %xmm6
pshufd $85, %xmm6, %xmm6
psllq $1, %xmm0
psrlq $1, %xmm0
pxor %xmm6, %xmm0
psrlq $63, %xmm6
paddq %xmm6, %xmm0
paddq %xmm4, %xmm0
// Code 15 is stored but not read anywhere in this routine.
movl $15, (%rsp)
jmp .L_2TAG_PACKET_6.0.2
.L_2TAG_PACKET_4.0.2:
// result = (sum + head) * scale
addsd %xmm4, %xmm0
mulsd %xmm3, %xmm0
jmp ..B1.5
.L_2TAG_PACKET_3.0.2:
// n >= 1023: result = (sum + head) * scale; if the exponent field of the
// result is all ones (32752 = 0x7ff0), go store code 14.
addsd %xmm4, %xmm0
mulsd %xmm3, %xmm0
pextrw $3, %xmm0, %ecx
andl $32752, %ecx
cmpl $32752, %ecx
jnb .L_2TAG_PACKET_7.0.2
jmp ..B1.5
.L_2TAG_PACKET_2.0.2:
// -1022 - n > 52: no head/tail split; result = (xmm2 + product) * scale.
paddd %xmm6, %xmm3
addpd %xmm2, %xmm0
mulsd %xmm3, %xmm0
movl $15, (%rsp)
jmp .L_2TAG_PACKET_6.0.2
.L_2TAG_PACKET_8.0.2:
// Reached from .L_2TAG_PACKET_0.0.2 with eax = high dword of x, sign cleared,
// and eax >= 1083179008 (0x40900000).
// High dword >= 2146435072 (0x7ff00000) means Inf or NaN.
cmpl $2146435072, %eax
jae .L_2TAG_PACKET_9.0.2
// Finite and large: reload the high dword with its sign; negative x goes
// to the XMIN*XMIN path, positive x computes XMAX*XMAX.
movl 12(%rsp), %eax
cmpl $-2147483648, %eax
jae .L_2TAG_PACKET_10.0.2
movsd XMAX(%rip), %xmm0
mulsd %xmm0, %xmm0
.L_2TAG_PACKET_7.0.2:
// Code 14 is stored but not read anywhere in this routine.
movl $14, (%rsp)
jmp .L_2TAG_PACKET_6.0.2
.L_2TAG_PACKET_10.0.2:
// Large negative x: result = XMIN * XMIN.
movsd XMIN(%rip), %xmm0
mulsd %xmm0, %xmm0
movl $15, (%rsp)
jmp .L_2TAG_PACKET_6.0.2
.L_2TAG_PACKET_9.0.2:
// Inf or NaN. edx = low dword of x. A high dword above 0x7ff00000 or a
// nonzero low dword means NaN.
movl 8(%rsp), %edx
cmpl $2146435072, %eax
ja .L_2TAG_PACKET_11.0.2
cmpl $0, %edx
jne .L_2TAG_PACKET_11.0.2
// Infinity: the signed high dword equals 0x7ff00000 only for +Inf.
movl 12(%rsp), %eax
cmpl $2146435072, %eax
jne .L_2TAG_PACKET_12.0.2
movsd INF(%rip), %xmm0
jmp ..B1.5
.L_2TAG_PACKET_12.0.2:
// -Inf: return ZERO.
movsd ZERO(%rip), %xmm0
jmp ..B1.5
.L_2TAG_PACKET_11.0.2:
// NaN: return x + x.
movsd 8(%rsp), %xmm0
addsd %xmm0, %xmm0
jmp ..B1.5
.L_2TAG_PACKET_0.0.2:
// Out of the main range. eax = high dword of x with the sign cleared.
// At or above 1083179008 (0x40900000) the argument is large, Inf or NaN;
// otherwise it is below the lower bound of the main path and the
// result is x + ONE_val.
movl 12(%rsp), %eax
andl $2147483647, %eax
cmpl $1083179008, %eax
jae .L_2TAG_PACKET_8.0.2
movsd 8(%rsp), %xmm0
addsd ONE_val(%rip), %xmm0
jmp ..B1.5
.L_2TAG_PACKET_6.0.2:
// Store the result to the frame and load it straight back.
movq %xmm0, 16(%rsp)
..B1.3:
movq 16(%rsp), %xmm0
.L_2TAG_PACKET_13.0.2:
..B1.5:
// Common exit: release the frame and return with the result in xmm0.
addq $24, %rsp
..___tag_value_exp.4:
ret
..___tag_value_exp.5:
END(exp)
# -- End exp
// Read-only constants for exp. Every object loaded with movapd/movdqa
// is 16-byte aligned.
.section .rodata, "a"
.align 16
.align 16
// cv: six 16-byte rows of doubles, addressed by exp as cv+0 .. cv+80.
// Each double is written as two .long words, low word first.
//   cv+0   multiplier applied to x (both lanes equal; the bit pattern is
//          approximately 64/log(2))
//   cv+16  first part of the step subtracted from x (approximately
//          log(2)/64, low 16 mantissa bits zero)
//   cv+32  second, much smaller part of that step
//   cv+48  coefficient applied to y^2 (both lanes equal, approximately 1/2)
//   cv+64  polynomial coefficient pair (approximately 1/720 and 1/24)
//   cv+80  polynomial coefficient pair (approximately 1/120 and 1/6)
cv:
.long 1697350398
.long 1079448903
.long 1697350398
.long 1079448903
.long 4277796864
.long 1065758274
.long 4277796864
.long 1065758274
.long 3164486458
.long 1025308570
.long 3164486458
.long 1025308570
.long 4294967294
.long 1071644671
.long 4294967294
.long 1071644671
.long 3811088480
.long 1062650204
.long 1432067621
.long 1067799893
.long 3230715663
.long 1065423125
.long 1431604129
.long 1069897045
.type cv,@object
.size cv,96
// Shifter: the double 1.5 * 2^52 (bit pattern 0x4338000000000000) in both
// lanes. exp adds it to the scaled argument and subtracts it again, which
// leaves the rounded integer in the low bits of the intermediate sum.
.align 16
Shifter:
.quad 0x4338000000000000
.quad 0x4338000000000000
.type Shifter,@object
.size Shifter,16
// mmask: per-quad integer mask 0x00000000ffffffc0. exp ANDs it with the
// shifted sum to keep the low 32 bits with their lowest 6 bits cleared.
.align 16
mmask:
.quad 0x00000000ffffffc0
.quad 0x00000000ffffffc0
.type mmask,@object
.size mmask,16
// bias: per-quad integer 0xffc0 (65472 = 1023 * 64). exp adds it to the
// masked integer before shifting left by 46, so the biased exponent ends
// up in the exponent field of a double.
.align 16
bias:
.quad 0x000000000000ffc0
.quad 0x000000000000ffc0
.type bias,@object
.size bias,16
// Tbl_addr: 64 entries of 16 bytes (1024 bytes total), indexed by exp with
// byte offset (m & 63) * 16. Each entry is four .long words forming two
// quads, low word first:
//   low quad   a double that exp adds to the reduced argument
//              (the T_lo[j] term of the algorithm description)
//   high quad  fraction bits with a zero exponent field; exp ORs these with
//              the computed exponent bits to form the scale factor
//              (the T_hi[j] term of the algorithm description)
// Entry 0 is all zero.
.align 16
Tbl_addr:
.long 0
.long 0
.long 0
.long 0
.long 235107661
.long 1018002367
.long 1048019040
.long 11418
.long 896005651
.long 1015861842
.long 3541402996
.long 22960
.long 1642514529
.long 1012987726
.long 410360776
.long 34629
.long 1568897900
.long 1016568486
.long 1828292879
.long 46424
.long 1882168529
.long 1010744893
.long 852742562
.long 58348
.long 509852888
.long 1017336174
.long 3490863952
.long 70401
.long 653277307
.long 1017431380
.long 2930322911
.long 82586
.long 1649557430
.long 1017729363
.long 1014845818
.long 94904
.long 1058231231
.long 1015777676
.long 3949972341
.long 107355
.long 1044000607
.long 1016786167
.long 828946858
.long 119943
.long 1151779725
.long 1015705409
.long 2288159958
.long 132667
.long 3819481236
.long 1016499965
.long 1853186616
.long 145530
.long 2552227826
.long 1015039787
.long 1709341917
.long 158533
.long 1829350193
.long 1015216097
.long 4112506593
.long 171677
.long 1913391795
.long 1015756674
.long 2799960843
.long 184965
.long 1303423926
.long 1015238005
.long 171030293
.long 198398
.long 1574172746
.long 1016061241
.long 2992903935
.long 211976
.long 3424156969
.long 1017196428
.long 926591434
.long 225703
.long 1938513547
.long 1017631273
.long 887463926
.long 239579
.long 2804567149
.long 1015390024
.long 1276261410
.long 253606
.long 631083525
.long 1017690182
.long 569847337
.long 267786
.long 1623370770
.long 1011049453
.long 1617004845
.long 282120
.long 3667985273
.long 1013894369
.long 3049340112
.long 296610
.long 3145379760
.long 1014403278
.long 3577096743
.long 311258
.long 2603100681
.long 1017152460
.long 1990012070
.long 326066
.long 3249202951
.long 1017448880
.long 1453150081
.long 341035
.long 419288974
.long 1016280325
.long 917841882
.long 356167
.long 3793507337
.long 1016095713
.long 3712504873
.long 371463
.long 728023093
.long 1016345318
.long 363667784
.long 386927
.long 2582678538
.long 1017123460
.long 2956612996
.long 402558
.long 7592966
.long 1016721543
.long 2186617380
.long 418360
.long 228611441
.long 1016696141
.long 1719614412
.long 434334
.long 2261665670
.long 1017457593
.long 1013258798
.long 450482
.long 544148907
.long 1017323666
.long 3907805043
.long 466805
.long 2383914918
.long 1017143586
.long 1447192520
.long 483307
.long 1176412038
.long 1017267372
.long 1944781190
.long 499988
.long 2882956373
.long 1013312481
.long 919555682
.long 516851
.long 3154077648
.long 1016528543
.long 2571947538
.long 533897
.long 348651999
.long 1016405780
.long 2604962540
.long 551129
.long 3253791412
.long 1015920431
.long 1110089947
.long 568549
.long 1509121860
.long 1014756995
.long 2568320822
.long 586158
.long 2617649212
.long 1017340090
.long 2966275556
.long 603959
.long 553214634
.long 1016457425
.long 2682146383
.long 621954
.long 730975783
.long 1014083580
.long 2191782032
.long 640145
.long 1486499517
.long 1016818996
.long 2069751140
.long 658534
.long 2595788928
.long 1016407932
.long 2990417244
.long 677123
.long 1853053619
.long 1015310724
.long 1434058175
.long 695915
.long 2462790535
.long 1015814775
.long 2572866477
.long 714911
.long 3693944214
.long 1017259110
.long 3092190714
.long 734114
.long 2979333550
.long 1017188654
.long 4076559942
.long 753526
.long 174054861
.long 1014300631
.long 2420883922
.long 773150
.long 816778419
.long 1014197934
.long 3716502172
.long 792987
.long 3507050924
.long 1015341199
.long 777507147
.long 813041
.long 1821514088
.long 1013410604
.long 3706687593
.long 833312
.long 920623539
.long 1016295433
.long 1242007931
.long 853805
.long 2789017511
.long 1014276997
.long 3707479175
.long 874520
.long 3586233004
.long 1015962192
.long 64696965
.long 895462
.long 474650514
.long 1016642419
.long 863738718
.long 916631
.long 1614448851
.long 1014281732
.long 3884662774
.long 938030
.long 2450082086
.long 1016164135
.long 2728693977
.long 959663
.long 1101668360
.long 1015989180
.long 3999357479
.long 981531
.long 835814894
.long 1015702697
.long 1533953344
.long 1003638
.long 1301400989
.long 1014466875
.long 2174652632
.long 1025985
.type Tbl_addr,@object
.size Tbl_addr,1024
// ALLONES: 128 set bits. exp shifts each quad left by a run-time count to
// build the mask that separates the head of the scale factor from its tail.
.align 16
ALLONES:
.quad 0xffffffffffffffff
.quad 0xffffffffffffffff
.type ALLONES,@object
.size ALLONES,16
// ebias: bit pattern of the double 1.0 (0x3ff0000000000000) in both lanes.
// exp adds it with paddd to a register holding a raw exponent, turning that
// exponent into a biased one.
.align 16
ebias:
.quad 0x3ff0000000000000
.quad 0x3ff0000000000000
.type ebias,@object
.size ebias,16
// XMAX: largest finite double (0x7fefffffffffffff). exp squares it on the
// large-positive-argument path.
.align 4
XMAX:
.quad 0x7fefffffffffffff
.type XMAX,@object
.size XMAX,8
// XMIN: smallest positive normal double (0x0010000000000000). exp squares
// it on the large-negative-argument path.
.align 4
XMIN:
.quad 0x0010000000000000
.type XMIN,@object
.size XMIN,8
// INF: +infinity as a double (0x7ff0000000000000); returned by exp for
// an argument of +Inf.
.align 4
INF:
.quad 0x7ff0000000000000
.type INF,@object
.size INF,8
// ZERO: +0.0 as a double; returned by exp for an argument of -Inf.
.align 4
ZERO:
.quad 0x0000000000000000
.type ZERO,@object
.size ZERO,8
// ONE_val: 1.0 as a double (0x3ff0000000000000). exp returns x + ONE_val
// for arguments too small in magnitude for the main path.
.align 4
ONE_val:
.quad 0x3ff0000000000000
.type ONE_val,@object
.size ONE_val,8
// Switch to .data; nothing is emitted there before the next section switch.
.data
// Empty .note.GNU-stack section (GNU toolchain convention for marking an
// object as not needing an executable stack).
.section .note.GNU-stack, ""
// -- Begin DWARF2 SEGMENT .eh_frame
// Hand-encoded call-frame information for exp. The byte values below are
// decoded according to the DWARF CFI / .eh_frame encoding; multi-byte
// words are little-endian.
.section .eh_frame,"a",@progbits
.eh_frame_seg:
.align 1
// CIE: length 0x14.
.4byte 0x00000014
// CIE id 0, version 1, augmentation string "zR".
.8byte 0x00527a0100000000
// code alignment 1, data alignment -8, return-address register 16,
// augmentation length 1, FDE pointer encoding 0x1b (pc-relative sdata4),
// then DW_CFA_def_cfa: register 7, offset 8.
.8byte 0x08070c1b01107801
// DW_CFA_offset: register 16 saved at CFA-8; two DW_CFA_nop pad bytes.
.4byte 0x00000190
// FDE: length 0x1c, CIE pointer 0x1c (distance back to the CIE above).
.4byte 0x0000001c
.4byte 0x0000001c
// Start of exp relative to this field, and length of the covered range.
.4byte ..___tag_value_exp.1-.
.4byte ..___tag_value_exp.5-..___tag_value_exp.1
// Augmentation data length 0, then DW_CFA_advance_loc4 ...
.2byte 0x0400
// ... to just after "subq $24, %rsp".
.4byte ..___tag_value_exp.3-..___tag_value_exp.1
// DW_CFA_def_cfa_offset 32 (8 for the return address + 24 for the frame).
.2byte 0x200e
// DW_CFA_advance_loc4 to just after "addq $24, %rsp".
.byte 0x04
.4byte ..___tag_value_exp.4-..___tag_value_exp.3
// DW_CFA_def_cfa_offset 8.
.2byte 0x080e
// DW_CFA_nop padding.
.byte 0x00
# End