Switch 64-bit sparcv9 platforms from bn(64,64) to bn(64,32). This doesn't
hurt performance, because the number of multiplications does not increase with this switch (not on sparcv9, that is). On the contrary, it actually improves performance, because it spares a load of instructions used to chase carries. Not to mention that BN assembler modules can be shared more freely between 32- and 64-bit builds.
This commit is contained in:
parent
877e8e970c
commit
6df8c74d5b
@ -202,7 +202,7 @@ my %table=(
|
||||
"solaris-sparcv8-gcc","gcc:-mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
# -m32 should be safe to add as long as driver recognizes -mcpu=ultrasparc
|
||||
"solaris-sparcv9-gcc","gcc:-m32 -mcpu=ultrasparc -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"solaris64-sparcv9-gcc","gcc:-m64 -mcpu=ultrasparc -O3 -Wall -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-fPIC:-m64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"solaris64-sparcv9-gcc","gcc:-m64 -mcpu=ultrasparc -O3 -Wall -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-fPIC:-m64 -shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
####
|
||||
"debug-solaris-sparcv8-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -O -g -mv8 -Wall -DB_ENDIAN::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8.o::::::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"debug-solaris-sparcv9-gcc","gcc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -DPEDANTIC -O -g -mcpu=ultrasparc -pedantic -ansi -Wall -Wshadow -Wno-long-long -D__EXTENSIONS__ -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o:des_enc-sparc.o fcrypt_b.o:::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-fPIC:-shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
@ -214,7 +214,7 @@ my %table=(
|
||||
"solaris-sparcv7-cc","cc:-xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR:${no_asm}:dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"solaris-sparcv8-cc","cc:-xarch=v8 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"solaris-sparcv9-cc","cc:-xtarget=ultra -xarch=v8plusa -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9a -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-KPIC:-xarch=v9 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):/usr/ccs/bin/ar rs",
|
||||
"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9a -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:solaris-shared:-KPIC:-xarch=v9 -G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):/usr/ccs/bin/ar rs",
|
||||
####
|
||||
"debug-solaris-sparcv8-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -xarch=v8 -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT::-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8.o::::::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"debug-solaris-sparcv9-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DBN_CTX_DEBUG -DCRYPTO_MDEBUG_ALL -xtarget=ultra -xarch=v8plus -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-lsocket -lnsl -ldl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR::sparcv8plus.o::::md5-sparcv8plus.o::::::dlfcn:solaris-shared:-KPIC:-G -dy -z text:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
@ -335,7 +335,7 @@ my %table=(
|
||||
# -Wa,-Av8plus should do the trick no matter what.
|
||||
"linux-sparcv9","gcc:-m32 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -Wa,-Av8plusa -DBN_DIV2W::-D_REENTRANT:ULTRASPARC:-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::sparcv8plus.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv8plus.o::::::dlfcn:linux-shared:-fPIC:-m32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
# GCC 3.1 is a requirement
|
||||
"linux64-sparcv9","gcc:-m64 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT:ULTRASPARC:-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"linux64-sparcv9","gcc:-m64 -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT:ULTRASPARC:-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
#### Alpha Linux with GNU C and Compaq C setups
|
||||
# Special notes:
|
||||
# - linux-alpha+bwx-gcc is meant to be used from ./config only. If you
|
||||
@ -365,7 +365,7 @@ my %table=(
|
||||
# -DMD32_REG_T=int doesn't actually belong in sparc64 target, it
|
||||
# simply *happens* to work around a compiler bug in gcc 3.3.3,
|
||||
# triggered by RIPEMD160 code.
|
||||
"BSD-sparc64", "gcc:-DB_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"BSD-sparc64", "gcc:-DB_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR::bn_asm.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o::md5-sparcv9.o::::::dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"BSD-ia64", "gcc:-DL_ENDIAN -DTERMIOS -O3 -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
"BSD-x86_64", "gcc:-DL_ENDIAN -DTERMIOS -O3 -DMD32_REG_T=int -Wall::${BSDthreads}:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||
|
||||
|
8
TABLE
8
TABLE
@ -142,7 +142,7 @@ $unistd =
|
||||
$thread_cflag = -pthread -D_THREAD_SAFE -D_REENTRANT
|
||||
$sys_id =
|
||||
$lflags =
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR
|
||||
$bn_ops = BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR
|
||||
$cpuid_obj =
|
||||
$bn_obj = bn_asm.o sparcv9a-mont.o
|
||||
$des_obj = des_enc-sparc.o fcrypt_b.o
|
||||
@ -2923,7 +2923,7 @@ $unistd =
|
||||
$thread_cflag = -D_REENTRANT
|
||||
$sys_id = ULTRASPARC
|
||||
$lflags = -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
|
||||
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
|
||||
$cpuid_obj =
|
||||
$bn_obj = bn_asm.o sparcv9a-mont.o
|
||||
$des_obj = des_enc-sparc.o fcrypt_b.o
|
||||
@ -3625,7 +3625,7 @@ $unistd =
|
||||
$thread_cflag = -D_REENTRANT
|
||||
$sys_id = ULTRASPARC
|
||||
$lflags = -lsocket -lnsl -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
|
||||
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
|
||||
$cpuid_obj =
|
||||
$bn_obj = bn_asm.o sparcv9a-mont.o
|
||||
$des_obj = des_enc-sparc.o fcrypt_b.o
|
||||
@ -3652,7 +3652,7 @@ $unistd =
|
||||
$thread_cflag = -D_REENTRANT
|
||||
$sys_id = ULTRASPARC
|
||||
$lflags = -lsocket -lnsl -ldl
|
||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
|
||||
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
|
||||
$cpuid_obj =
|
||||
$bn_obj = bn_asm.o sparcv9a-mont.o
|
||||
$des_obj = des_enc-sparc.o fcrypt_b.o
|
||||
|
@ -138,11 +138,7 @@ $fname:
|
||||
save %sp,-$frame-$locals,%sp
|
||||
sethi %hi(0xffff),$mask
|
||||
or $mask,%lo(0xffff),$mask
|
||||
___
|
||||
$code.=<<___ if ($bits==64);
|
||||
ldx [%i4],$n0 ! $n0 reassigned, remember?
|
||||
___
|
||||
$code.=<<___ if ($bits==32);
|
||||
|
||||
cmp $num,4
|
||||
bl,a,pn %icc,.Lret
|
||||
clr %i0
|
||||
@ -160,8 +156,7 @@ $code.=<<___ if ($bits==32);
|
||||
ld [%i4+4],%o0
|
||||
sllx %o0,32,%o0
|
||||
or %o0,$n0,$n0 ! $n0=n0[1].n0[0]
|
||||
___
|
||||
$code.=<<___;
|
||||
|
||||
sll $num,3,$num ! num*=8
|
||||
|
||||
add %sp,$bias,%o0 ! real top of stack
|
||||
@ -188,48 +183,44 @@ $code.=<<___;
|
||||
|
||||
stx %o7,[%sp+$bias+$frame+48] ! save %asi
|
||||
|
||||
sub %g0,$num,$i
|
||||
sub %g0,$num,$j
|
||||
sub %g0,$num,$i ! i=-num
|
||||
sub %g0,$num,$j ! j=-num
|
||||
|
||||
add $ap,$j,%o3
|
||||
add $bp,$i,%o4
|
||||
___
|
||||
$code.=<<___ if ($bits==64);
|
||||
|
||||
ldx [$bp+$i],%o0 ! bp[0]
|
||||
ldx [$ap+$j],%o1 ! ap[0]
|
||||
___
|
||||
$code.=<<___ if ($bits==32);
|
||||
ldd [$bp+$i],%o0 ! bp[0]
|
||||
ldd [$ap+$j],%g2 ! ap[0]
|
||||
sllx %o1,32,%o1
|
||||
sllx %g3,32,%g3
|
||||
or %o0,%o1,%o0
|
||||
or %g2,%g3,%o1
|
||||
___
|
||||
$code.=<<___;
|
||||
sllx %o0,32,%g1
|
||||
sllx %o1,32,%g5
|
||||
srlx %o0,32,%o0
|
||||
srlx %o1,32,%o1
|
||||
or %g1,%o0,%o0
|
||||
or %g5,%o1,%o1
|
||||
|
||||
add $np,$j,%o5
|
||||
|
||||
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
|
||||
mulx $n0,%o0,%o0 ! ap[0]*bp[0]*n0
|
||||
stx %o0,[%sp+$bias+$frame+0]
|
||||
|
||||
ld [%o3+`$bits==32 ? 0 : 4`],$alo_ ! load a[j] as pair of 32-bit words
|
||||
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||
fzeros $alo
|
||||
ld [%o3+`$bits==32 ? 4 : 0`],$ahi_
|
||||
ld [%o3+4],$ahi_
|
||||
fzeros $ahi
|
||||
ld [%o5+`$bits==32 ? 0 : 4`],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
fzeros $nlo
|
||||
ld [%o5+`$bits==32 ? 4 : 0`],$nhi_
|
||||
ld [%o5+4],$nhi_
|
||||
fzeros $nhi
|
||||
|
||||
! transfer b[i] to FPU as 4x16-bit values
|
||||
ldda [%o4+`$bits==32 ? 2 : 6`]%asi,$ba
|
||||
ldda [%o4+2]%asi,$ba
|
||||
fxtod $alo,$alo
|
||||
ldda [%o4+`$bits==32 ? 0 : 4`]%asi,$bb
|
||||
ldda [%o4+0]%asi,$bb
|
||||
fxtod $ahi,$ahi
|
||||
ldda [%o4+`$bits==32 ? 6 : 2`]%asi,$bc
|
||||
ldda [%o4+6]%asi,$bc
|
||||
fxtod $nlo,$nlo
|
||||
ldda [%o4+`$bits==32 ? 4 : 0`]%asi,$bd
|
||||
ldda [%o4+4]%asi,$bd
|
||||
fxtod $nhi,$nhi
|
||||
|
||||
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
|
||||
@ -256,24 +247,24 @@ $code.=<<___;
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
fmuld $nlo,$nc,$nloc
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
fmuld $alo,$bd,$alod
|
||||
fmuld $nlo,$nd,$nlod
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
fmuld $ahi,$ba,$ahia
|
||||
fmuld $nhi,$na,$nhia
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
fmuld $ahi,$bb,$ahib
|
||||
fmuld $nhi,$nb,$nhib
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
fmuld $nhi,$nc,$nhic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $ahib,$nhib,$nhib
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
@ -317,13 +308,13 @@ $code.=<<___;
|
||||
.L1st:
|
||||
add $ap,$j,%o3
|
||||
add $np,$j,%o4
|
||||
ld [%o3+`$bits==32 ? 0 : 4`],$alo_ ! load a[j] as pair of 32-bit words
|
||||
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||
fzeros $alo
|
||||
ld [%o3+`$bits==32 ? 4 : 0`],$ahi_
|
||||
ld [%o3+4],$ahi_
|
||||
fzeros $ahi
|
||||
ld [%o4+`$bits==32 ? 0 : 4`],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
ld [%o4+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||
fzeros $nlo
|
||||
ld [%o4+`$bits==32 ? 4 : 0`],$nhi_
|
||||
ld [%o4+4],$nhi_
|
||||
fzeros $nhi
|
||||
|
||||
fxtod $alo,$alo
|
||||
@ -340,23 +331,23 @@ $code.=<<___;
|
||||
std $nhi,[$np_h+$j]
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
fmuld $nlo,$nc,$nloc
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
fmuld $alo,$bd,$alod
|
||||
fmuld $nlo,$nd,$nlod
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
fmuld $ahi,$ba,$ahia
|
||||
fmuld $nhi,$na,$nhia
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
fmuld $ahi,$bb,$ahib
|
||||
fmuld $nhi,$nb,$nhib
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
fmuld $nhi,$nc,$nhic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
fmuld $nhi,$nd,$nhid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $dota,$nloa,$nloa
|
||||
faddd $dotb,$nlob,$nlob
|
||||
@ -429,36 +420,31 @@ $code.=<<___;
|
||||
add $i,8,$i
|
||||
.align 32
|
||||
.Louter:
|
||||
sub %g0,$num,$j
|
||||
sub %g0,$num,$j ! j=-num
|
||||
add %sp,$bias+$frame+$locals,$tp
|
||||
|
||||
add $bp,$i,%o4
|
||||
___
|
||||
$code.=<<___ if ($bits==64);
|
||||
|
||||
ldx [$bp+$i],%o0 ! bp[i]
|
||||
ldx [$ap+$j],%o1 ! ap[0]
|
||||
___
|
||||
$code.=<<___ if ($bits==32);
|
||||
ldd [$bp+$i],%o0 ! bp[i]
|
||||
ldd [$ap+$j],%g2 ! ap[0]
|
||||
sllx %o1,32,%o1
|
||||
sllx %g3,32,%g3
|
||||
or %o0,%o1,%o0
|
||||
or %g2,%g3,%o1
|
||||
___
|
||||
$code.=<<___;
|
||||
sllx %o0,32,%g1
|
||||
sllx %o1,32,%g5
|
||||
srlx %o0,32,%o0
|
||||
srlx %o1,32,%o1
|
||||
or %g1,%o0,%o0
|
||||
or %g5,%o1,%o1
|
||||
|
||||
ldx [$tp],%o2 ! tp[0]
|
||||
mulx %o1,%o0,%o0
|
||||
addcc %o2,%o0,%o0
|
||||
mulx $n0,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
|
||||
stx %o0,[%sp+$bias+$frame+0]
|
||||
|
||||
|
||||
! transfer b[i] to FPU as 4x16-bit values
|
||||
ldda [%o4+`$bits==32 ? 2 : 6`]%asi,$ba
|
||||
ldda [%o4+`$bits==32 ? 0 : 4`]%asi,$bb
|
||||
ldda [%o4+`$bits==32 ? 6 : 2`]%asi,$bc
|
||||
ldda [%o4+`$bits==32 ? 4 : 0`]%asi,$bd
|
||||
ldda [%o4+2]%asi,$ba
|
||||
ldda [%o4+0]%asi,$bb
|
||||
ldda [%o4+6]%asi,$bc
|
||||
ldda [%o4+4]%asi,$bd
|
||||
|
||||
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
|
||||
ldda [%sp+$bias+$frame+6]%asi,$na
|
||||
@ -483,24 +469,24 @@ $code.=<<___;
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
fmuld $nlo,$nc,$nloc
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
fmuld $alo,$bd,$alod
|
||||
fmuld $nlo,$nd,$nlod
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
fmuld $ahi,$ba,$ahia
|
||||
fmuld $nhi,$na,$nhia
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
fmuld $ahi,$bb,$ahib
|
||||
fmuld $nhi,$nb,$nhib
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
fmuld $nhi,$nc,$nhic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $ahib,$nhib,$nhib
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
faddd $ahid,$nhid,$dotb ! $nhid
|
||||
|
||||
@ -558,24 +544,24 @@ $code.=<<___;
|
||||
fmuld $alo,$bb,$alob
|
||||
fmuld $nlo,$nb,$nlob
|
||||
fmuld $alo,$bc,$aloc
|
||||
fmuld $nlo,$nc,$nloc
|
||||
faddd $aloa,$nloa,$nloa
|
||||
fmuld $nlo,$nc,$nloc
|
||||
fmuld $alo,$bd,$alod
|
||||
fmuld $nlo,$nd,$nlod
|
||||
faddd $alob,$nlob,$nlob
|
||||
fmuld $nlo,$nd,$nlod
|
||||
fmuld $ahi,$ba,$ahia
|
||||
fmuld $nhi,$na,$nhia
|
||||
faddd $aloc,$nloc,$nloc
|
||||
fmuld $nhi,$na,$nhia
|
||||
fmuld $ahi,$bb,$ahib
|
||||
fmuld $nhi,$nb,$nhib
|
||||
faddd $alod,$nlod,$nlod
|
||||
fmuld $nhi,$nb,$nhib
|
||||
fmuld $ahi,$bc,$ahic
|
||||
fmuld $nhi,$nc,$nhic
|
||||
faddd $ahia,$nhia,$nhia
|
||||
fmuld $nhi,$nc,$nhic
|
||||
fmuld $ahi,$bd,$ahid
|
||||
faddd $ahib,$nhib,$nhib
|
||||
fmuld $nhi,$nd,$nhid
|
||||
|
||||
faddd $ahib,$nhib,$nhib
|
||||
faddd $dota,$nloa,$nloa
|
||||
faddd $dotb,$nlob,$nlob
|
||||
faddd $ahic,$nhic,$dota ! $nhic
|
||||
@ -661,7 +647,7 @@ $code.=<<___;
|
||||
add $tp,8,$tp ! adjust tp to point at the end
|
||||
|
||||
ld [$tp-8],%o0
|
||||
ld [$np-`$bits==32 ? 4 : 8`],%o1
|
||||
ld [$np-4],%o1
|
||||
cmp %o0,%o1 ! compare topmost words
|
||||
bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken
|
||||
nop
|
||||
@ -670,41 +656,26 @@ $code.=<<___;
|
||||
.Lsub:
|
||||
ldd [$tp+%o7],%o0
|
||||
ldd [$np+%o7],%o2
|
||||
___
|
||||
$code.=<<___ if ($bits==64);
|
||||
subccc %o1,%o3,%o3
|
||||
subccc %o0,%o2,%o2
|
||||
___
|
||||
$code.=<<___ if ($bits==32);
|
||||
subccc %o1,%o2,%o2
|
||||
subccc %o0,%o3,%o3
|
||||
___
|
||||
$code.=<<___;
|
||||
std %o2,[$rp+%o7]
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lsub
|
||||
nop
|
||||
subccc $carry,0,$carry
|
||||
bcc,pt %icc,.Lzap
|
||||
sub %g0,$num,%o7
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
|
||||
.align 16,0x1000000
|
||||
.Lcopy:
|
||||
ldx [$tp+%o7],%o0
|
||||
___
|
||||
$code.=<<___ if ($bits==64);
|
||||
stx %o0,[$rp+%o7]
|
||||
___
|
||||
$code.=<<___ if ($bits==32);
|
||||
srlx %o0,32,%o1
|
||||
std %o0,[$rp+%o7]
|
||||
___
|
||||
$code.=<<___;
|
||||
add %o7,8,%o7
|
||||
brnz,pt %o7,.Lcopy
|
||||
nop
|
||||
ba .Lzap
|
||||
sub %g0,$num,%o7
|
||||
sub %g0,$num,%o7 ! n=-num
|
||||
|
||||
.align 32
|
||||
.Lzap:
|
||||
|
Loading…
Reference in New Issue
Block a user