Yet another "teaser" Montgomery multiplication module, for PowerPC.
This commit is contained in:
parent
b46343583c
commit
2c5d4daac5
17
Configure
17
Configure
@ -314,7 +314,7 @@ my %table=(
|
|||||||
# *-generic* is endian-neutral target, but ./config is free to
|
# *-generic* is endian-neutral target, but ./config is free to
|
||||||
# throw in -D[BL]_ENDIAN, whichever appropriate...
|
# throw in -D[BL]_ENDIAN, whichever appropriate...
|
||||||
"linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-generic32","gcc:-DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-ppc", "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o:::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
# linux-ppc: the bn asm object names must match the linux_ppc32*.s rules in crypto/bn/Makefile.
"linux-ppc", "gcc:-DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc32.o linux_ppc32-mont.o:::::::::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
#### IA-32 targets...
|
#### IA-32 targets...
|
||||||
"linux-ia32-icc", "icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-ia32-icc", "icc:-DL_ENDIAN -DTERMIO -O2 -no_cpprt::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-KPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-elf", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-elf", "gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG ${x86_gcc_des} ${x86_gcc_opts}:${x86_elf_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
@ -322,7 +322,7 @@ my %table=(
|
|||||||
####
|
####
|
||||||
"linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
# -bpowerpc64-linux is transient option, -m64 should be the one to use...
|
# -bpowerpc64-linux is transient option, -m64 should be the one to use...
|
||||||
"linux-ppc64", "gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o:::::::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-ppc64", "gcc:-bpowerpc64-linux -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL::linux_ppc64.o linux_ppc64-mont.o:::::::::::dlfcn:linux-shared:-fPIC:-bpowerpc64-linux:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
@ -407,12 +407,12 @@ my %table=(
|
|||||||
|
|
||||||
#### IBM's AIX.
|
#### IBM's AIX.
|
||||||
"aix3-cc", "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::",
|
"aix3-cc", "cc:-O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::BN_LLONG RC4_CHAR:::",
|
||||||
"aix-gcc", "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o:::::::::::dlfcn:",
|
"aix-gcc", "gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::::::::dlfcn:",
|
||||||
"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o:::::::::::dlfcn::::::-X64",
|
"aix64-gcc","gcc:-O -DB_ENDIAN::-D_THREAD_SAFE:AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::::::::dlfcn::::::-X64",
|
||||||
# Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE
|
# Below targets assume AIX 5. Idea is to effectively disregard $OBJECT_MODE
|
||||||
# at build time. $OBJECT_MODE is respected at ./config stage!
|
# at build time. $OBJECT_MODE is respected at ./config stage!
|
||||||
"aix-cc", "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o:::::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
|
"aix-cc", "cc:-q32 -O -DB_ENDIAN -qmaxmem=16384::-qthreaded:AIX::BN_LLONG RC4_CHAR::aix_ppc32.o aix_ppc32-mont.o:::::::::::dlfcn:aix-shared::-q32:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 32",
|
||||||
"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o:::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
|
"aix64-cc", "cc:-q64 -O -DB_ENDIAN -qmaxmem=16384::(unknown):AIX::SIXTY_FOUR_BIT_LONG RC4_CHAR::aix_ppc64.o aix_ppc64-mont.o:::::::::::dlfcn:aix-shared::-q64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)::-X 64",
|
||||||
|
|
||||||
#
|
#
|
||||||
# Cray T90 and similar (SDSC)
|
# Cray T90 and similar (SDSC)
|
||||||
@ -504,9 +504,10 @@ my %table=(
|
|||||||
|
|
||||||
##### MacOS X (a.k.a. Rhapsody or Darwin) setup
|
##### MacOS X (a.k.a. Rhapsody or Darwin) setup
|
||||||
"rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::",
|
"rhapsody-ppc-cc","cc:-O3 -DB_ENDIAN::(unknown):MACOSX_RHAPSODY::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}::",
|
||||||
"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
|
"darwin-ppc-cc","cc:-O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
|
||||||
|
"darwin64-ppc-cc","cc:-m64 -O3 -DB_ENDIAN::-D_REENTRANT:MACOSX:-Wl,-search_paths_first:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc64.o osx_ppc64-mont.o:::::::::::dlfcn:darwin-shared:-fPIC -fno-common:-m64 -dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
|
||||||
"darwin-i386-cc","cc:-O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
|
"darwin-i386-cc","cc:-O3 -fomit-frame-pointer -DL_ENDIAN::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:${no_asm}:dlfcn:darwin-shared:-fPIC -fno-common:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
|
||||||
"debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o:::::::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
|
"debug-darwin-ppc-cc","cc:-DBN_DEBUG -DREF_CHECK -DCONF_DEBUG -DCRYPTO_MDEBUG -DB_ENDIAN -g -Wall -O::-D_REENTRANT:MACOSX::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::osx_ppc32.o osx_ppc32-mont.o:::::::::::dlfcn:darwin-shared:-fPIC:-dynamiclib:.\$(SHLIB_MAJOR).\$(SHLIB_MINOR).dylib",
|
||||||
|
|
||||||
##### A/UX
|
##### A/UX
|
||||||
"aux3-gcc","gcc:-O2 -DTERMIO::(unknown):AUX:-lbsd:RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::",
|
"aux3-gcc","gcc:-O2 -DTERMIO::(unknown):AUX:-lbsd:RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::",
|
||||||
|
@ -120,6 +120,14 @@ linux_ppc64.s: asm/ppc.pl; $(PERL) $< $@
|
|||||||
aix_ppc32.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@
|
aix_ppc32.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@
|
||||||
aix_ppc64.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@
|
aix_ppc64.s: asm/ppc.pl; $(PERL) asm/ppc.pl $@
|
||||||
osx_ppc32.s: asm/ppc.pl; $(PERL) $< $@
|
osx_ppc32.s: asm/ppc.pl; $(PERL) $< $@
|
||||||
|
osx_ppc64.s: asm/ppc.pl; $(PERL) $< $@
|
||||||
|
|
||||||
|
linux_ppc32-mont.s: asm/ppc-mont.pl; $(PERL) $< $@
|
||||||
|
linux_ppc64-mont.s: asm/ppc-mont.pl; $(PERL) $< $@
|
||||||
|
aix_ppc32-mont.s: asm/ppc-mont.pl; $(PERL) asm/ppc-mont.pl $@
|
||||||
|
aix_ppc64-mont.s: asm/ppc-mont.pl; $(PERL) asm/ppc-mont.pl $@
|
||||||
|
osx_ppc32-mont.s: asm/ppc-mont.pl; $(PERL) $< $@
|
||||||
|
osx_ppc64-mont.s: asm/ppc-mont.pl; $(PERL) $< $@
|
||||||
|
|
||||||
files:
|
files:
|
||||||
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
|
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO
|
||||||
|
327
crypto/bn/asm/ppc-mont.pl
Normal file
327
crypto/bn/asm/ppc-mont.pl
Normal file
@ -0,0 +1,327 @@
|
|||||||
|
#!/usr/bin/env perl
|
||||||
|
|
||||||
|
# ====================================================================
|
||||||
|
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||||
|
# project. Rights for redistribution and usage in source and binary
|
||||||
|
# forms are granted according to the OpenSSL license.
|
||||||
|
# ====================================================================
|
||||||
|
|
||||||
|
# April 2006
|
||||||
|
|
||||||
|
# "Teaser" Montgomery multiplication module for PowerPC. It's possible
|
||||||
|
# to gain a bit more by modulo-scheduling outer loop, then dedicated
|
||||||
|
# squaring procedure should give further 20% and code can be adapted
|
||||||
|
# for 32-bit application running on 64-bit CPU. As for the latter.
|
||||||
|
# It won't be able to achieve "native" 64-bit performance, because in
|
||||||
|
# 32-bit application context every addc instruction will have to be
|
||||||
|
# expanded as addc, twice right shift by 32 and finally adde, etc.
|
||||||
|
# So far RSA *sign* performance improvement over pre-bn_mul_mont asm
|
||||||
|
# for 64-bit application running on PPC970/G5 is:
|
||||||
|
#
|
||||||
|
# 512-bit +65%
|
||||||
|
# 1024-bit +35%
|
||||||
|
# 2048-bit +18%
|
||||||
|
# 4096-bit +4%
|
||||||
|
|
||||||
|
$output = shift;
|
||||||
|
|
||||||
|
if ($output =~ /32\-mont\.s/) {
|
||||||
|
$BITS= 32;
|
||||||
|
$BNSZ= $BITS/8;
|
||||||
|
$SIZE_T=4;
|
||||||
|
$RZONE= 224;
|
||||||
|
$FRAME= $SIZE_T*16;
|
||||||
|
|
||||||
|
$LD= "lwz"; # load
|
||||||
|
$LDU= "lwzu"; # load and update
|
||||||
|
$LDX= "lwzx"; # load indexed
|
||||||
|
$ST= "stw"; # store
|
||||||
|
$STU= "stwu"; # store and update
|
||||||
|
$STX= "stwx"; # store indexed
|
||||||
|
$STUX= "stwux"; # store indexed and update
|
||||||
|
$UMULL= "mullw"; # unsigned multiply low
|
||||||
|
$UMULH= "mulhwu"; # unsigned multiply high
|
||||||
|
$UCMP= "cmplw"; # unsigned compare
|
||||||
|
$PUSH= $ST;
|
||||||
|
$POP= $LD;
|
||||||
|
} elsif ($output =~ /64\-mont\.s/) {
|
||||||
|
$BITS= 64;
|
||||||
|
$BNSZ= $BITS/8;
|
||||||
|
$SIZE_T=8;
|
||||||
|
$RZONE= 288;
|
||||||
|
$FRAME= $SIZE_T*16;
|
||||||
|
|
||||||
|
# same as above, but 64-bit mnemonics...
|
||||||
|
$LD= "ld"; # load
|
||||||
|
$LDU= "ldu"; # load and update
|
||||||
|
$LDX= "ldx"; # load indexed
|
||||||
|
$ST= "std"; # store
|
||||||
|
$STU= "stdu"; # store and update
|
||||||
|
$STX= "stdx"; # store indexed
|
||||||
|
$STUX= "stdux"; # store indexed and update
|
||||||
|
$UMULL= "mulld"; # unsigned multiply low
|
||||||
|
$UMULH= "mulhdu"; # unsigned multiply high
|
||||||
|
$UCMP= "cmpld"; # unsigned compare
|
||||||
|
$PUSH= $ST;
|
||||||
|
$POP= $LD;
|
||||||
|
} else { die "nonsense $output"; }
|
||||||
|
|
||||||
|
( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
|
||||||
|
die "can't call ../perlasm/ppc-xlate.pl: $!";
|
||||||
|
|
||||||
|
$sp="r1";
|
||||||
|
$toc="r2";
|
||||||
|
$rp="r3"; $ovf="r3";
|
||||||
|
$ap="r4";
|
||||||
|
$bp="r5";
|
||||||
|
$np="r6";
|
||||||
|
$n0="r7";
|
||||||
|
$num="r8";
|
||||||
|
$rp="r9"; # $rp is reassigned
|
||||||
|
$aj="r10";
|
||||||
|
$nj="r11";
|
||||||
|
$tj="r12";
|
||||||
|
# non-volatile registers
|
||||||
|
$i="r14";
|
||||||
|
$j="r15";
|
||||||
|
$tp="r16";
|
||||||
|
$m0="r17";
|
||||||
|
$m1="r18";
|
||||||
|
$lo0="r19";
|
||||||
|
$hi0="r20";
|
||||||
|
$lo1="r21";
|
||||||
|
$hi1="r22";
|
||||||
|
$alo="r23";
|
||||||
|
$ahi="r24";
|
||||||
|
$nlo="r25";
|
||||||
|
#
|
||||||
|
$nhi="r0";
|
||||||
|
|
||||||
|
$code=<<___;
|
||||||
|
.text
|
||||||
|
|
||||||
|
.globl .bn_mul_mont
|
||||||
|
.align 4
|
||||||
|
.bn_mul_mont:
|
||||||
|
cmpwi $num,4
|
||||||
|
mr $rp,r3 ; $rp is reassigned
|
||||||
|
li r3,0
|
||||||
|
bltlr
|
||||||
|
|
||||||
|
slwi $num,$num,`log($BNSZ)/log(2)`
|
||||||
|
li $tj,-4096
|
||||||
|
addi $ovf,$num,`$FRAME+$RZONE`
|
||||||
|
subf $ovf,$ovf,$sp ; $sp-$ovf
|
||||||
|
and $ovf,$ovf,$tj ; minimize TLB usage
|
||||||
|
subf $ovf,$sp,$ovf ; $ovf-$sp
|
||||||
|
srwi $num,$num,`log($BNSZ)/log(2)`
|
||||||
|
$STUX $sp,$sp,$ovf
|
||||||
|
|
||||||
|
$PUSH r14,`4*$SIZE_T`($sp)
|
||||||
|
$PUSH r15,`5*$SIZE_T`($sp)
|
||||||
|
$PUSH r16,`6*$SIZE_T`($sp)
|
||||||
|
$PUSH r17,`7*$SIZE_T`($sp)
|
||||||
|
$PUSH r18,`8*$SIZE_T`($sp)
|
||||||
|
$PUSH r19,`9*$SIZE_T`($sp)
|
||||||
|
$PUSH r20,`10*$SIZE_T`($sp)
|
||||||
|
$PUSH r21,`11*$SIZE_T`($sp)
|
||||||
|
$PUSH r22,`12*$SIZE_T`($sp)
|
||||||
|
$PUSH r23,`13*$SIZE_T`($sp)
|
||||||
|
$PUSH r24,`14*$SIZE_T`($sp)
|
||||||
|
$PUSH r25,`15*$SIZE_T`($sp)
|
||||||
|
|
||||||
|
$LD $n0,0($n0) ; pull n0[0] value
|
||||||
|
addi $num,$num,-2 ; adjust $num for counter register
|
||||||
|
|
||||||
|
$LD $m0,0($bp) ; m0=bp[0]
|
||||||
|
$LD $aj,0($ap) ; ap[0]
|
||||||
|
addi $tp,$sp,$FRAME
|
||||||
|
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
|
||||||
|
$UMULH $hi0,$aj,$m0
|
||||||
|
|
||||||
|
$LD $aj,$BNSZ($ap) ; ap[1]
|
||||||
|
$LD $nj,0($np) ; np[0]
|
||||||
|
|
||||||
|
$UMULL $m1,$lo0,$n0 ; "tp[0]"*n0
|
||||||
|
|
||||||
|
$UMULL $alo,$aj,$m0 ; ap[1]*bp[0]
|
||||||
|
$UMULH $ahi,$aj,$m0
|
||||||
|
|
||||||
|
$UMULL $lo1,$nj,$m1 ; np[0]*m1
|
||||||
|
$UMULH $hi1,$nj,$m1
|
||||||
|
$LD $nj,$BNSZ($np) ; np[1]
|
||||||
|
addc $lo1,$lo1,$lo0
|
||||||
|
addze $hi1,$hi1
|
||||||
|
|
||||||
|
$UMULL $nlo,$nj,$m1 ; np[1]*m1
|
||||||
|
$UMULH $nhi,$nj,$m1
|
||||||
|
|
||||||
|
mtctr $num
|
||||||
|
li $j,`2*$BNSZ`
|
||||||
|
.align 4
|
||||||
|
L1st:
|
||||||
|
$LDX $aj,$ap,$j ; ap[j]
|
||||||
|
$LDX $nj,$np,$j ; np[j]
|
||||||
|
addc $lo0,$alo,$hi0
|
||||||
|
addze $hi0,$ahi
|
||||||
|
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
|
||||||
|
$UMULH $ahi,$aj,$m0
|
||||||
|
|
||||||
|
addc $lo1,$nlo,$hi1
|
||||||
|
addze $hi1,$nhi
|
||||||
|
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
||||||
|
$UMULH $nhi,$nj,$m1
|
||||||
|
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
|
||||||
|
addze $hi1,$hi1
|
||||||
|
$ST $lo1,0($tp) ; tp[j-1]
|
||||||
|
|
||||||
|
addi $j,$j,$BNSZ ; j++
|
||||||
|
addi $tp,$tp,$BNSZ ; tp++
|
||||||
|
bdnz- L1st
|
||||||
|
;L1st
|
||||||
|
addc $lo0,$alo,$hi0
|
||||||
|
addze $hi0,$ahi
|
||||||
|
|
||||||
|
addc $lo1,$nlo,$hi1
|
||||||
|
addze $hi1,$nhi
|
||||||
|
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
|
||||||
|
addze $hi1,$hi1
|
||||||
|
$ST $lo1,0($tp) ; tp[j-1]
|
||||||
|
|
||||||
|
li $ovf,0
|
||||||
|
addc $hi1,$hi1,$hi0
|
||||||
|
addze $ovf,$ovf ; upmost overflow bit
|
||||||
|
$ST $hi1,$BNSZ($tp)
|
||||||
|
|
||||||
|
li $i,$BNSZ
|
||||||
|
.align 4
|
||||||
|
Louter:
|
||||||
|
$LDX $m0,$bp,$i ; m0=bp[i]
|
||||||
|
$LD $aj,0($ap) ; ap[0]
|
||||||
|
addi $tp,$sp,$FRAME
|
||||||
|
$LD $tj,$FRAME($sp) ; tp[0]
|
||||||
|
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
|
||||||
|
$UMULH $hi0,$aj,$m0
|
||||||
|
$LD $aj,$BNSZ($ap) ; ap[1]
|
||||||
|
$LD $nj,0($np) ; np[0]
|
||||||
|
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
|
||||||
|
addze $hi0,$hi0
|
||||||
|
|
||||||
|
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
|
||||||
|
|
||||||
|
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
||||||
|
$UMULH $ahi,$aj,$m0
|
||||||
|
|
||||||
|
$UMULL $lo1,$nj,$m1 ; np[0]*m1
|
||||||
|
$UMULH $hi1,$nj,$m1
|
||||||
|
$LD $nj,$BNSZ($np) ; np[1]
|
||||||
|
addc $lo1,$lo1,$lo0
|
||||||
|
addze $hi1,$hi1
|
||||||
|
|
||||||
|
$UMULL $nlo,$nj,$m1 ; np[1]*m1
|
||||||
|
$UMULH $nhi,$nj,$m1
|
||||||
|
|
||||||
|
mtctr $num
|
||||||
|
li $j,`2*$BNSZ`
|
||||||
|
.align 4
|
||||||
|
Linner:
|
||||||
|
$LDX $aj,$ap,$j ; ap[j]
|
||||||
|
$LD $tj,$BNSZ($tp) ; tp[j]
|
||||||
|
addc $lo0,$alo,$hi0
|
||||||
|
addze $hi0,$ahi
|
||||||
|
$LDX $nj,$np,$j ; np[j]
|
||||||
|
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
|
||||||
|
addze $hi0,$hi0
|
||||||
|
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
||||||
|
$UMULH $ahi,$aj,$m0
|
||||||
|
|
||||||
|
addc $lo1,$nlo,$hi1
|
||||||
|
addze $hi1,$nhi
|
||||||
|
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
||||||
|
$UMULH $nhi,$nj,$m1
|
||||||
|
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||||
|
addze $hi1,$hi1
|
||||||
|
$ST $lo1,0($tp) ; tp[j-1]
|
||||||
|
|
||||||
|
addi $j,$j,$BNSZ ; j++
|
||||||
|
addi $tp,$tp,$BNSZ ; tp++
|
||||||
|
bdnz- Linner
|
||||||
|
;Linner
|
||||||
|
$LD $tj,$BNSZ($tp) ; tp[j]
|
||||||
|
addc $lo0,$alo,$hi0
|
||||||
|
addze $hi0,$ahi
|
||||||
|
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
|
||||||
|
addze $hi0,$hi0
|
||||||
|
|
||||||
|
addc $lo1,$nlo,$hi1
|
||||||
|
addze $hi1,$nhi
|
||||||
|
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||||
|
addze $hi1,$hi1
|
||||||
|
$ST $lo1,0($tp) ; tp[j-1]
|
||||||
|
|
||||||
|
addic $ovf,$ovf,-1 ; move upmost overflow to XER[CA]
|
||||||
|
li $ovf,0
|
||||||
|
adde $hi1,$hi1,$hi0
|
||||||
|
addze $ovf,$ovf
|
||||||
|
$ST $hi1,$BNSZ($tp)
|
||||||
|
;
|
||||||
|
slwi $tj,$num,`log($BNSZ)/log(2)`
|
||||||
|
$UCMP $i,$tj
|
||||||
|
addi $i,$i,$BNSZ
|
||||||
|
ble- Louter
|
||||||
|
|
||||||
|
addi $num,$num,2 ; restore $num
|
||||||
|
addi $tp,$sp,$FRAME
|
||||||
|
mtctr $num
|
||||||
|
li $j,0
|
||||||
|
|
||||||
|
subfc. $ovf,$j,$ovf ; sets XER[CA]
|
||||||
|
bne Lsub
|
||||||
|
$UCMP $hi1,$nj
|
||||||
|
bge Lsub
|
||||||
|
.align 4
|
||||||
|
Lcopy:
|
||||||
|
$LDX $tj,$tp,$j
|
||||||
|
$STX $tj,$rp,$j
|
||||||
|
$STX $j,$tp,$j ; zap at once
|
||||||
|
addi $j,$j,$BNSZ
|
||||||
|
bdnz- Lcopy
|
||||||
|
|
||||||
|
Lexit:
|
||||||
|
$POP r14,`4*$SIZE_T`($sp)
|
||||||
|
$POP r15,`5*$SIZE_T`($sp)
|
||||||
|
$POP r16,`6*$SIZE_T`($sp)
|
||||||
|
$POP r17,`7*$SIZE_T`($sp)
|
||||||
|
$POP r18,`8*$SIZE_T`($sp)
|
||||||
|
$POP r19,`9*$SIZE_T`($sp)
|
||||||
|
$POP r20,`10*$SIZE_T`($sp)
|
||||||
|
$POP r21,`11*$SIZE_T`($sp)
|
||||||
|
$POP r22,`12*$SIZE_T`($sp)
|
||||||
|
$POP r23,`13*$SIZE_T`($sp)
|
||||||
|
$POP r24,`14*$SIZE_T`($sp)
|
||||||
|
$POP r25,`15*$SIZE_T`($sp)
|
||||||
|
$POP $sp,0($sp)
|
||||||
|
li r3,1
|
||||||
|
blr
|
||||||
|
.long 0
|
||||||
|
.align 4
|
||||||
|
Lsub: $LDX $tj,$tp,$j
|
||||||
|
$LDX $nj,$np,$j
|
||||||
|
subfe $tj,$nj,$tj ; tp[j]-np[j]
|
||||||
|
$STX $tj,$rp,$j
|
||||||
|
addi $j,$j,$BNSZ
|
||||||
|
bdnz- Lsub
|
||||||
|
li $j,0
|
||||||
|
subfe. $ovf,$j,$ovf
|
||||||
|
mtctr $num
|
||||||
|
bne Lcopy
|
||||||
|
.align 4
|
||||||
|
Lzap: $STX $j,$tp,$j
|
||||||
|
addi $j,$j,$BNSZ
|
||||||
|
bdnz- Lzap
|
||||||
|
b Lexit
|
||||||
|
___
|
||||||
|
|
||||||
|
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||||
|
print $code;
|
||||||
|
# STDOUT may be a pipe into ppc-xlate.pl; check close so that a failing
# translator or a write error is reported instead of silently ignored.
close STDOUT or die "error closing STDOUT: $!";
|
113
crypto/perlasm/ppc-xlate.pl
Executable file
113
crypto/perlasm/ppc-xlate.pl
Executable file
@ -0,0 +1,113 @@
|
|||||||
|
#!/usr/bin/env perl
|
||||||
|
|
||||||
|
# PowerPC assembler distiller by <appro>.
|
||||||
|
|
||||||
|
my $output = shift;
|
||||||
|
# Redirect STDOUT to the output file. Low-precedence `or` is required here:
# with `||` the die would bind to the (always true) file-name string and
# a failed open would go unnoticed.
open STDOUT, '>', $output or die "can't open $output: $!";
|
||||||
|
|
||||||
|
my $flavour = $output;
|
||||||
|
my %GLOBALS;
|
||||||
|
my $dotinlocallabels=0;
|
||||||
|
|
||||||
|
################################################################
|
||||||
|
# directives which need special treatment on different platforms
|
||||||
|
################################################################
|
||||||
|
# Handler for the ".globl" directive.
# Arguments: an ignored first argument, then the symbol name as written
# in the input. One leading '.' or '_' is stripped from the name, which is
# then decorated according to $flavour (the output file name):
#   aix       - name gets a '.' prefix
#   osx       - name gets a '_' prefix
#   linux 32  - emits .globl plus .type ...,@function; enables .L local labels
#   linux 64  - emits .globl/.type for the dotted name plus a 24-byte entry
#               for the undotted name in the ".opd" section; the dotted name
#               becomes the label; enables .L local labels
# The final name is stored in %GLOBALS under the name as originally written,
# so that the matching label definition can be rewritten later.
# Returns the directive text to print.
my $globl = sub {
|
||||||
|
my $junk = shift;
|
||||||
|
my $name = shift;
|
||||||
|
# reference taken before $name is modified, i.e. keyed by the original spelling
my $global = \$GLOBALS{$name};
|
||||||
|
my $ret;
|
||||||
|
|
||||||
|
$name =~ s|^[\.\_]||;
|
||||||
|
|
||||||
|
SWITCH: for ($flavour) {
|
||||||
|
/aix/ && do { $name = ".$name";
|
||||||
|
last;
|
||||||
|
};
|
||||||
|
/osx/ && do { $name = "_$name";
|
||||||
|
last;
|
||||||
|
};
|
||||||
|
/linux.*32/ && do { $ret .= ".globl $name\n";
|
||||||
|
$ret .= ".type $name,\@function";
|
||||||
|
$dotinlocallabels = 1;
|
||||||
|
last;
|
||||||
|
};
|
||||||
|
/linux.*64/ && do { $ret .= ".globl .$name\n";
|
||||||
|
$ret .= ".type .$name,\@function\n";
|
||||||
|
$ret .= ".section \".opd\",\"aw\"\n";
|
||||||
|
$ret .= ".globl $name\n";
|
||||||
|
$ret .= ".align 3\n";
|
||||||
|
$ret .= "$name:\n";
|
||||||
|
$ret .= ".quad .$name,.TOC.\@tocbase,0\n";
|
||||||
|
$ret .= ".size $name,24\n";
|
||||||
|
$ret .= ".previous\n";
|
||||||
|
|
||||||
|
$name = ".$name";
|
||||||
|
$dotinlocallabels = 1;
|
||||||
|
last;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
# flavours that built no text above get a plain .globl with the decorated name
$ret = ".globl $name" if (!$ret);
|
||||||
|
$$global = $name;
|
||||||
|
$ret;
|
||||||
|
};
|
||||||
|
# Handler for the ".machine" directive.
# Arguments: an ignored first argument, then the architecture name.
# The name "any" is replaced by "ppc970" when $flavour matches /osx/.
# Returns the resulting ".machine <arch>" line.
my $machine = sub {
|
||||||
|
my $junk = shift;
|
||||||
|
my $arch = shift;
|
||||||
|
$arch = "ppc970" if ($arch eq "any" and $flavour =~ /osx/);
|
||||||
|
".machine $arch";
|
||||||
|
};
|
||||||
|
|
||||||
|
################################################################
|
||||||
|
# simplified mnemonics not handled by at least one assembler
|
||||||
|
################################################################
|
||||||
|
# Rewrites the simplified "cmplw" mnemonic as an explicit "cmpl".
# Arguments: the mnemonic suffix, then the instruction operands.
# When three or more operands are given, the first one is used as the
# condition-register field; otherwise that field is 0. A constant 0 is
# inserted after it (presumably the operand-size field selecting a 32-bit
# compare - confirm against the ISA), followed by the remaining operands.
my $cmplw = sub {
|
||||||
|
my $f = shift;
|
||||||
|
my $cr = 0; $cr = shift if ($#_>1);
|
||||||
|
" cmpl$f ".join(',',$cr,0,@_);
|
||||||
|
};
|
||||||
|
# Rewrites the simplified "cmpld" mnemonic as an explicit "cmpl".
# Arguments: the mnemonic suffix, then the instruction operands.
# When three or more operands are given, the first one is used as the
# condition-register field; otherwise that field is 0. A constant 1 is
# inserted after it (presumably the operand-size field selecting a 64-bit
# compare - confirm against the ISA), followed by the remaining operands.
my $cmpld = sub {
|
||||||
|
my $f = shift;
|
||||||
|
my $cr = 0; $cr = shift if ($#_>1);
|
||||||
|
" cmpl$f ".join(',',$cr,1,@_);
|
||||||
|
};
|
||||||
|
# Rewrites the simplified "bdnz" mnemonic as an explicit "bc BO,0,target".
# Arguments: the mnemonic suffix, then the branch target.
# BO is 17 when the suffix contains '+' or '-' and 16 otherwise.
# NOTE(review): both hint suffixes map to the same BO value - confirm intended.
my $bdnz = sub {
|
||||||
|
my $f = shift;
|
||||||
|
my $bo = $f=~/[\+\-]/ ? 17 : 16;
|
||||||
|
" bc $bo,0,".shift;
|
||||||
|
};
|
||||||
|
|
||||||
|
while($line=<>) {
|
||||||
|
|
||||||
|
$line =~ s|[#!;].*$||; # get rid of asm-style comments...
|
||||||
|
$line =~ s|/\*.*\*/||; # ... and C-style comments...
|
||||||
|
$line =~ s|^\s+||; # ... and skip white spaces in beginning...
|
||||||
|
$line =~ s|\s+$||; # ... and at the end
|
||||||
|
|
||||||
|
{
|
||||||
|
$line =~ s|\b\.L(\w+)|L$1|g; # common denominator for Locallabel
|
||||||
|
$line =~ s|\bL(\w+)|\.L$1|g if ($dotinlocallabels);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
$line =~ s|(^[\.\w]+)\:\s*||;
|
||||||
|
my $label = $1;
|
||||||
|
printf "%s:",($GLOBALS{$label} or $label) if ($label);
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
$line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||;
|
||||||
|
my $c = $1; $c = "\t" if ($c eq "");
|
||||||
|
my $mnemonic = $2;
|
||||||
|
my $f = $3;
|
||||||
|
my $opcode = eval("\$$mnemonic");
|
||||||
|
if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
|
||||||
|
elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; }
|
||||||
|
}
|
||||||
|
|
||||||
|
print $line if ($line);
|
||||||
|
print "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
# Buffered write errors on the output file only surface at close; report them.
close STDOUT or die "error closing STDOUT: $!";
|
Loading…
x
Reference in New Issue
Block a user