PA-RISC assembler pack update from HEAD.
This commit is contained in:
parent
5d9bb428bb
commit
1a111921da
1021
crypto/aes/asm/aes-parisc.pl
Normal file
1021
crypto/aes/asm/aes-parisc.pl
Normal file
File diff suppressed because it is too large
Load Diff
993
crypto/bn/asm/parisc-mont.pl
Normal file
993
crypto/bn/asm/parisc-mont.pl
Normal file
@ -0,0 +1,993 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# On PA-7100LC this module performs ~90-50% better, less for longer
|
||||
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
|
||||
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
|
||||
# multiplication, which in turn means that "baseline" performance was
|
||||
# optimal in respect to instruction set capabilities. Fair comparison
|
||||
# with vendor compiler is problematic, because OpenSSL doesn't define
|
||||
# BN_LLONG [presumably] for historical reasons, which drives compiler
|
||||
# toward 4 times 16x16=32-bit multiplications [plus complementary
|
||||
# shifts and additions] instead. This means that you should observe
|
||||
# several times improvement over code generated by vendor compiler
|
||||
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
|
||||
# improvement coefficient was never collected on PA-7100LC, or any
|
||||
# other 1.1 CPU, because I don't have access to such machine with
|
||||
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
|
||||
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
|
||||
# of ~5x on PA-8600.
|
||||
#
|
||||
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
|
||||
# reportedly ~2x faster than vendor compiler generated code [according
|
||||
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
|
||||
# this implementation is actually 32-bit one, in the sense that it
|
||||
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
|
||||
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
|
||||
# module picks halves of 64-bit values in reverse order and pretends
|
||||
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
|
||||
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
|
||||
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
|
||||
# i.e. there is no "wider" multiplication like on most other 64-bit
|
||||
# platforms. This means that even being effectively 32-bit, this
|
||||
# implementation performs "64-bit" computational task in same amount
|
||||
# of arithmetic operations, most notably multiplications. It requires
|
||||
# more memory references, most notably to tp[num], but this doesn't
|
||||
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
|
||||
# 2.0 code path provides virtually the same performance as pa-risc2[W].s:
|
||||
# it's ~10% better for shortest key length and ~10% worse for longest
|
||||
# one.
|
||||
#
|
||||
# In case it wasn't clear. The module has two distinct code paths:
|
||||
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
|
||||
# additions and 64-bit integer loads, not to mention specific
|
||||
# instruction scheduling. In 64-bit build naturally only 2.0 code path
|
||||
# is assembled. In 32-bit application context both code paths are
|
||||
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
|
||||
# is taken automatically. Also, in 32-bit build the module imposes
|
||||
# a couple of limitations: vector lengths have to be even and vector
|
||||
# addresses have to be 64-bit aligned. Normally neither is a problem:
|
||||
# most common key lengths are even and vectors are commonly malloc-ed,
|
||||
# which ensures alignment.
|
||||
#
|
||||
# Special thanks to polarhome.com for providing HP-UX account on
|
||||
# PA-RISC 1.1 machine, and to correspondent who chose to remain
|
||||
# anonymous for testing the code on PA-RISC 2.0 machine.
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
|
||||
# Command line: <flavour> <output-file>.  A flavour containing "64" selects
# the 64-bit ABI in the configuration that follows; anything else is
# treated as a 32-bit build.
$flavour = shift;
$output = shift;

# Send the generated assembly to the requested file.  The three-argument
# form keeps characters in the file name from being interpreted as an open
# mode, and a failure is fatal instead of silently discarding everything
# that is printed later.  When no file name is given the inherited STDOUT
# is left alone.
if (defined($output) && $output ne "") {
    open STDOUT,">",$output or die "can't open $output: $!";
}
|
||||
|
||||
# Select the ABI-dependent parameters used by the code template below.
#
#   $LEVEL         argument of the .LEVEL directive
#   $SIZE_T        size of a pointer/saved register slot in bytes
#   $FRAME_MARKER  size of the frame marker in bytes
#   $SAVED_RP      offset (below %sp) at which the return pointer is saved
#   $PUSH/$PUSHMA  store / store-and-modify mnemonics of register width
#   $POP/$POPMB    load / load-and-modify mnemonics of register width
#   $BN_SZ         size in bytes of the bignum words the module operates on
if ($flavour =~ /64/) {
    $LEVEL        ="2.0W";
    $SIZE_T       =8;
    $FRAME_MARKER =80;
    $SAVED_RP     =16;
    $PUSH         ="std";
    $PUSHMA       ="std,ma";
    $POP          ="ldd";
    $POPMB        ="ldd,mb";
    $BN_SZ        =$SIZE_T;
} else {
    $LEVEL        ="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
    $SIZE_T       =4;
    $FRAME_MARKER =48;
    $SAVED_RP     =20;
    $PUSH         ="stw";
    $PUSHMA       ="stwm";
    $POP          ="ldw";
    $POPMB        ="ldwm";
    $BN_SZ        =$SIZE_T;

    # A 32-bit build may still be configured with 64-bit bignum words.
    # Look for "#define SIXTY_FOUR_BIT" in opensslconf.h two directories
    # above this script; when present switch to 8-byte words and the 2.0
    # instruction level.  A missing header simply keeps the defaults.
    # A lexical handle and a lexical line variable are used so that neither
    # a global file handle nor the global $_ is disturbed.
    if (open my $conf,"<","${dir}../../opensslconf.h") {
        while (my $line = <$conf>) {
            if ($line =~ m/#\s*define\s+SIXTY_FOUR_BIT/) {
                $BN_SZ=8;
                $LEVEL="2.0";
                last;
            }
        }
        close $conf;
    }
}
|
||||
|
||||
# Stack frame layout.  $LOCALS is the size of the area holding the 8 saved
# registers; the complete frame is that area plus the frame marker
# [+ argument transfer] plus 32 bytes of local variables.
$LOCALS = 8*$SIZE_T;
$FRAME  = $LOCALS + $FRAME_MARKER + 32;
|
||||
|
||||
$tp="%r31";
|
||||
$ti1="%r29";
|
||||
$ti0="%r28";
|
||||
|
||||
$rp="%r26";
|
||||
$ap="%r25";
|
||||
$bp="%r24";
|
||||
$np="%r23";
|
||||
$n0="%r22"; # passed through stack in 32-bit
|
||||
$num="%r21"; # passed through stack in 32-bit
|
||||
$idx="%r20";
|
||||
$arrsz="%r19";
|
||||
|
||||
$nm1="%r7";
|
||||
$nm0="%r6";
|
||||
$ab1="%r5";
|
||||
$ab0="%r4";
|
||||
|
||||
$fp="%r3";
|
||||
$hi1="%r2";
|
||||
$hi0="%r1";
|
||||
|
||||
$xfer=$n0; # accommodates [-16..15] offset in fld[dw]s
|
||||
|
||||
$fm0="%fr4"; $fti=$fm0;
|
||||
$fbi="%fr5L";
|
||||
$fn0="%fr5R";
|
||||
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
|
||||
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";
|
||||
|
||||
$code=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
|
||||
.ALIGN 64
|
||||
bn_mul_mont
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
||||
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
||||
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
||||
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
||||
ldo -$FRAME(%sp),$fp
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
ldw `-$FRAME_MARKER-4`($fp),$n0
|
||||
ldw `-$FRAME_MARKER-8`($fp),$num
|
||||
nop
|
||||
nop ; alignment
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
comiclr,<= 6,$num,%r0 ; are vectors long enough?
|
||||
b L\$abort
|
||||
ldi 0,%r28 ; signal "unhandled"
|
||||
add,ev %r0,$num,$num ; is $num even?
|
||||
b L\$abort
|
||||
nop
|
||||
or $ap,$np,$ti1
|
||||
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
|
||||
b L\$abort
|
||||
nop
|
||||
nop ; alignment
|
||||
nop
|
||||
|
||||
fldws 0($n0),${fn0}
|
||||
fldws,ma 4($bp),${fbi} ; bp[0]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
comib,> 3,$num,L\$abort ; are vectors long enough?
|
||||
ldi 0,%r28 ; signal "unhandled"
|
||||
addl $num,$num,$num ; I operate on 32-bit values
|
||||
|
||||
fldws 4($n0),${fn0} ; only low part of n0
|
||||
fldws 4($bp),${fbi} ; bp[0] in flipped word order
|
||||
___
|
||||
$code.=<<___;
|
||||
fldds 0($ap),${fai} ; ap[0,1]
|
||||
fldds 0($np),${fni} ; np[0,1]
|
||||
|
||||
sh2addl $num,%r0,$arrsz
|
||||
ldi 31,$hi0
|
||||
ldo 36($arrsz),$hi1 ; space for tp[num+1]
|
||||
andcm $hi1,$hi0,$hi1 ; align
|
||||
addl $hi1,%sp,%sp
|
||||
$PUSH $fp,-$SIZE_T(%sp)
|
||||
|
||||
ldo `$LOCALS+16`($fp),$xfer
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
addl $arrsz,$ap,$ap ; point at the end
|
||||
addl $arrsz,$np,$np
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
ldo 8($idx),$idx ; j++++
|
||||
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
fstds ${fab1},0($xfer)
|
||||
fstds ${fnm1},8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2,3]
|
||||
flddx $idx($np),${fni} ; np[2,3]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
mtctl $hi0,%cr11 ; $hi0 still holds 31
|
||||
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
|
||||
b L\$parisc11
|
||||
nop
|
||||
___
|
||||
$code.=<<___; # PA-RISC 2.0 code-path
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
ldo 8($idx),$idx ; j++++
|
||||
addl $ab0,$nm0,$nm0 ; low part is discarded
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
L\$1st
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
addl $hi1,$nm1,$nm1
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
addl $hi0,$ab0,$ab0
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
addl $ab0,$nm0,$nm0
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$1st ; j++++
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
addl $hi1,$nm1,$nm1
|
||||
ldd -16($xfer),$ab0
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldd -8($xfer),$nm0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
addl $hi0,$ab0,$ab0
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
ldd 0($xfer),$ab1
|
||||
addl $ab0,$nm0,$nm0
|
||||
ldd,mb 8($xfer),$nm1
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
|
||||
ldo -1($num),$num ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
fldws,ma 4($bp),${fbi} ; bp[1]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
fldws 0($bp),${fbi} ; bp[1] in flipped word order
|
||||
___
|
||||
$code.=<<___;
|
||||
flddx $idx($ap),${fai} ; ap[0,1]
|
||||
flddx $idx($np),${fni} ; np[0,1]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
addl $hi0,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
addl $hi1,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
L\$outer
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer) ; 33-bit value
|
||||
fstds ${fnm0},-8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2]
|
||||
flddx $idx($np),${fni} ; np[2]
|
||||
ldo 8($idx),$idx ; j++++
|
||||
ldd -16($xfer),$ab0 ; 33-bit value
|
||||
ldd -8($xfer),$nm0
|
||||
ldw 0($xfer),$hi0 ; high part
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
extrd,u $ab0,31,32,$ti0 ; carry bit
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $ti0,$hi0,$hi0 ; account carry bit
|
||||
fstds ${fnm1},8($xfer)
|
||||
addl $ab0,$nm0,$nm0 ; low part is discarded
|
||||
ldw 0($tp),$ti1 ; tp[1]
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
|
||||
L\$inner
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ti1,$ti1
|
||||
addl $ti1,$ab1,$ab1
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldd -16($xfer),$ab0
|
||||
fstds ${fab0},-16($xfer)
|
||||
addl $hi0,$ti0,$ti0
|
||||
addl $ti0,$ab0,$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
addl $ab0,$nm0,$nm0
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$inner ; j++++
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldd 0($xfer),$ab1
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $hi0,$ti1,$ti1
|
||||
addl $ti1,$ab1,$ab1
|
||||
ldd 8($xfer),$nm1
|
||||
fstds ${fnm1},8($xfer)
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
ldd -16($xfer),$ab0
|
||||
ldd -8($xfer),$nm0
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
|
||||
addl $hi0,$ab0,$ab0
|
||||
addl $ti0,$ab0,$ab0
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
extrd,u $ab0,31,32,$hi0
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
extrd,u $ab0,63,32,$ab0
|
||||
addl $hi1,$nm0,$nm0
|
||||
ldd 0($xfer),$ab1
|
||||
addl $ab0,$nm0,$nm0
|
||||
ldd,mb 8($xfer),$nm1
|
||||
extrd,u $nm0,31,32,$hi1
|
||||
stw,ma $nm0,8($tp) ; tp[j-1]
|
||||
|
||||
addib,= -1,$num,L\$outerdone ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
fldws,ma 4($bp),${fbi} ; bp[i]
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
ldi 12,$ti0 ; bp[i] in flipped word order
|
||||
addl,ev %r0,$num,$num
|
||||
ldi -4,$ti0
|
||||
addl $ti0,$bp,$bp
|
||||
fldws 0($bp),${fbi}
|
||||
___
|
||||
$code.=<<___;
|
||||
flddx $idx($ap),${fai} ; ap[0]
|
||||
addl $hi0,$ab1,$ab1
|
||||
flddx $idx($np),${fni} ; np[0]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
addl $ti1,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
addl $hi1,$nm1,$nm1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
addl $hi1,$hi0,$hi0
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
addl $ti0,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
b L\$outer
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
L\$outerdone
|
||||
addl $hi0,$ab1,$ab1
|
||||
addl $ti1,$ab1,$ab1
|
||||
extrd,u $ab1,31,32,$hi0
|
||||
extrd,u $ab1,63,32,$ab1
|
||||
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
addl $hi1,$nm1,$nm1
|
||||
addl $ab1,$nm1,$nm1
|
||||
extrd,u $nm1,31,32,$hi1
|
||||
stw $nm1,-4($tp) ; tp[j-1]
|
||||
|
||||
addl $hi1,$hi0,$hi0
|
||||
addl $ti0,$hi0,$hi0
|
||||
extrd,u $hi0,31,32,$hi1
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
sub %r0,%r0,%r0 ; clear borrow
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==4);
|
||||
ldws,ma 4($tp),$ti0
|
||||
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
|
||||
b L\$sub_pa11
|
||||
addl $tp,$arrsz,$tp
|
||||
L\$sub
|
||||
ldwx $idx($np),$hi0
|
||||
subb $ti0,$hi0,$hi1
|
||||
ldwx $idx($tp),$ti0
|
||||
addib,<> 4,$idx,L\$sub
|
||||
stws,ma $hi1,4($rp)
|
||||
|
||||
subb $ti0,%r0,$hi1
|
||||
ldo -4($tp),$tp
|
||||
___
|
||||
$code.=<<___ if ($BN_SZ==8);
|
||||
ldd,ma 8($tp),$ti0
|
||||
L\$sub
|
||||
ldd $idx($np),$hi0
|
||||
shrpd $ti0,$ti0,32,$ti0 ; flip word order
|
||||
std $ti0,-8($tp) ; save flipped value
|
||||
sub,db $ti0,$hi0,$hi1
|
||||
ldd,ma 8($tp),$ti0
|
||||
addib,<> 8,$idx,L\$sub
|
||||
std,ma $hi1,8($rp)
|
||||
|
||||
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
|
||||
sub,db $ti0,%r0,$hi1
|
||||
ldo -8($tp),$tp
|
||||
___
|
||||
$code.=<<___;
|
||||
and $tp,$hi1,$ap
|
||||
andcm $rp,$hi1,$bp
|
||||
or $ap,$bp,$np
|
||||
|
||||
sub $rp,$arrsz,$rp ; rewind rp
|
||||
subi 0,$arrsz,$idx
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
L\$copy
|
||||
ldd $idx($np),$hi0
|
||||
std,ma %r0,8($tp)
|
||||
addib,<> 8,$idx,.-8 ; L\$copy
|
||||
std,ma $hi0,8($rp)
|
||||
___
|
||||
|
||||
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
|
||||
$ablo=$ab0;
|
||||
$abhi=$ab1;
|
||||
$nmlo0=$nm0;
|
||||
$nmhi0=$nm1;
|
||||
$nmlo1="%r9";
|
||||
$nmhi1="%r8";
|
||||
|
||||
$code.=<<___;
|
||||
b L\$done
|
||||
nop
|
||||
|
||||
.ALIGN 8
|
||||
L\$parisc11
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -12($xfer),$ablo
|
||||
ldw -16($xfer),$hi0
|
||||
ldw -4($xfer),$nmlo0
|
||||
ldw -8($xfer),$nmhi0
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
add $ablo,$nmlo0,$nmlo0 ; discarded
|
||||
addc %r0,$nmhi0,$hi1
|
||||
ldw 4($xfer),$ablo
|
||||
ldw 0($xfer),$abhi
|
||||
nop
|
||||
|
||||
L\$1st_pa11
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 12($xfer),$nmlo1
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 8($xfer),$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
fstds ${fab1},0($xfer)
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -12($xfer),$ablo
|
||||
addc %r0,$nmhi1,$hi1
|
||||
ldw -16($xfer),$abhi
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
|
||||
ldw -4($xfer),$nmlo0
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -8($xfer),$nmhi0
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fab0},-16($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$1st_pa11 ; j++++
|
||||
addc %r0,$nmhi0,$hi1
|
||||
|
||||
ldw 8($xfer),$nmhi1
|
||||
ldw 12($xfer),$nmlo1
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
add $hi0,$ablo,$ablo
|
||||
fstds ${fab1},0($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
ldw -16($xfer),$abhi
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -12($xfer),$ablo
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -8($xfer),$nmhi0
|
||||
addc %r0,$nmhi1,$hi1
|
||||
ldw -4($xfer),$nmlo0
|
||||
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldws,mb 8($xfer),$nmhi1
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$nmlo1
|
||||
addc %r0,$nmhi0,$hi1
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
|
||||
ldo -1($num),$num ; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
|
||||
fldws,ma 4($bp),${fbi} ; bp[1]
|
||||
flddx $idx($ap),${fai} ; ap[0,1]
|
||||
flddx $idx($np),${fni} ; np[0,1]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
L\$outer_pa11
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
|
||||
fstds ${fab0},-16($xfer) ; 33-bit value
|
||||
fstds ${fnm0},-8($xfer)
|
||||
flddx $idx($ap),${fai} ; ap[2,3]
|
||||
flddx $idx($np),${fni} ; np[2,3]
|
||||
ldw -16($xfer),$abhi ; carry bit actually
|
||||
ldo 8($idx),$idx ; j++++
|
||||
ldw -12($xfer),$ablo
|
||||
ldw -8($xfer),$nmhi0
|
||||
ldw -4($xfer),$nmlo0
|
||||
ldw 0($xfer),$hi0 ; high part
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
fstds ${fab1},0($xfer)
|
||||
addl $abhi,$hi0,$hi0 ; account carry bit
|
||||
fstds ${fnm1},8($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0 ; discarded
|
||||
ldw 0($tp),$ti1 ; tp[1]
|
||||
addc %r0,$nmhi0,$hi1
|
||||
fstds ${fab0},-16($xfer)
|
||||
fstds ${fnm0},-8($xfer)
|
||||
ldw 4($xfer),$ablo
|
||||
ldw 0($xfer),$abhi
|
||||
|
||||
L\$inner_pa11
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
|
||||
flddx $idx($ap),${fai} ; ap[j,j+1]
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
|
||||
flddx $idx($np),${fni} ; np[j,j+1]
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addc %r0,$abhi,$abhi
|
||||
ldw 12($xfer),$nmlo1
|
||||
add $ti1,$ablo,$ablo
|
||||
ldw 8($xfer),$nmhi1
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fab1},0($xfer)
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
fstds ${fnm1},8($xfer)
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -12($xfer),$ablo
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -16($xfer),$abhi
|
||||
addc %r0,$nmhi1,$hi1
|
||||
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
|
||||
ldw -4($xfer),$nmlo0
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw -8($xfer),$nmhi0
|
||||
addc %r0,$abhi,$abhi
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
add $ti0,$ablo,$ablo
|
||||
fstds ${fab0},-16($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
fstds ${fnm0},-8($xfer)
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
addib,<> 8,$idx,L\$inner_pa11 ; j++++
|
||||
addc %r0,$nmhi0,$hi1
|
||||
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
|
||||
ldw 12($xfer),$nmlo1
|
||||
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
|
||||
ldw 8($xfer),$nmhi1
|
||||
add $hi0,$ablo,$ablo
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
addc %r0,$abhi,$abhi
|
||||
fstds ${fab1},0($xfer)
|
||||
add $ti1,$ablo,$ablo
|
||||
fstds ${fnm1},8($xfer)
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw -16($xfer),$abhi
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
ldw -12($xfer),$ablo
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
ldw -8($xfer),$nmhi0
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
ldw -4($xfer),$nmlo0
|
||||
addc %r0,$nmhi1,$hi1
|
||||
|
||||
add $hi0,$ablo,$ablo
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
addc %r0,$abhi,$abhi
|
||||
add $ti0,$ablo,$ablo
|
||||
ldw 8($tp),$ti1 ; tp[j]
|
||||
addc %r0,$abhi,$hi0
|
||||
ldw 0($xfer),$abhi
|
||||
add $ablo,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$ablo
|
||||
addc %r0,$nmhi0,$nmhi0
|
||||
ldws,mb 8($xfer),$nmhi1
|
||||
add $hi1,$nmlo0,$nmlo0
|
||||
ldw 4($xfer),$nmlo1
|
||||
addc %r0,$nmhi0,$hi1
|
||||
stws,ma $nmlo0,8($tp) ; tp[j-1]
|
||||
|
||||
addib,= -1,$num,L\$outerdone_pa11; i--
|
||||
subi 0,$arrsz,$idx ; j=0
|
||||
|
||||
fldws,ma 4($bp),${fbi} ; bp[i]
|
||||
flddx $idx($ap),${fai} ; ap[0]
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$abhi
|
||||
flddx $idx($np),${fni} ; np[0]
|
||||
fldws 8($xfer),${fti}R ; tp[0]
|
||||
add $ti1,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
|
||||
ldo 8($idx),$idx ; j++++
|
||||
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
|
||||
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
fstws,mb ${fab0}L,-8($xfer) ; save high part
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
fcpy,sgl %fr0,${fti}L ; zero high part
|
||||
fcpy,sgl %fr0,${fab0}L
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
|
||||
fcnvxf,dbl,dbl ${fab0},${fab0}
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
|
||||
add $ti0,$hi0,$hi0
|
||||
addc %r0,$hi1,$hi1
|
||||
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
xmpyu ${fn0},${fab0}R,${fm0}
|
||||
|
||||
b L\$outer_pa11
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
|
||||
L\$outerdone_pa11
|
||||
add $hi0,$ablo,$ablo
|
||||
addc %r0,$abhi,$abhi
|
||||
add $ti1,$ablo,$ablo
|
||||
addc %r0,$abhi,$hi0
|
||||
|
||||
ldw 4($tp),$ti0 ; tp[j]
|
||||
|
||||
add $hi1,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$nmhi1
|
||||
add $ablo,$nmlo1,$nmlo1
|
||||
addc %r0,$nmhi1,$hi1
|
||||
stw $nmlo1,-4($tp) ; tp[j-1]
|
||||
|
||||
add $hi1,$hi0,$hi0
|
||||
addc %r0,%r0,$hi1
|
||||
add $ti0,$hi0,$hi0
|
||||
addc %r0,$hi1,$hi1
|
||||
stw $hi0,0($tp)
|
||||
stw $hi1,4($tp)
|
||||
|
||||
ldo `$LOCALS+32+4`($fp),$tp
|
||||
sub %r0,%r0,%r0 ; clear borrow
|
||||
ldw -4($tp),$ti0
|
||||
addl $tp,$arrsz,$tp
|
||||
L\$sub_pa11
|
||||
ldwx $idx($np),$hi0
|
||||
subb $ti0,$hi0,$hi1
|
||||
ldwx $idx($tp),$ti0
|
||||
addib,<> 4,$idx,L\$sub_pa11
|
||||
stws,ma $hi1,4($rp)
|
||||
|
||||
subb $ti0,%r0,$hi1
|
||||
ldo -4($tp),$tp
|
||||
and $tp,$hi1,$ap
|
||||
andcm $rp,$hi1,$bp
|
||||
or $ap,$bp,$np
|
||||
|
||||
sub $rp,$arrsz,$rp ; rewind rp
|
||||
subi 0,$arrsz,$idx
|
||||
ldo `$LOCALS+32`($fp),$tp
|
||||
L\$copy_pa11
|
||||
ldwx $idx($np),$hi0
|
||||
stws,ma %r0,4($tp)
|
||||
addib,<> 4,$idx,L\$copy_pa11
|
||||
stws,ma $hi0,4($rp)
|
||||
|
||||
nop ; alignment
|
||||
L\$done
|
||||
___
|
||||
}
|
||||
|
||||
$code.=<<___;
|
||||
ldi 1,%r28 ; signal "handled"
|
||||
ldo $FRAME($fp),%sp ; destroy tp[num+1]
|
||||
|
||||
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
|
||||
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
||||
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
||||
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
||||
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
|
||||
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
|
||||
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
|
||||
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
|
||||
L\$abort
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
$POPMB -$FRAME(%sp),%r3
|
||||
.PROCEND
|
||||
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
|
||||
# that it can be compiled with .LEVEL 1.0. It should be noted that I
|
||||
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
|
||||
# directive...
|
||||
|
||||
# Hand-encode an "ldd" instruction as a raw .WORD so that the module can be
# fed to an assembler running at a pre-2.0 .LEVEL.
#
#   $mod   completer text that followed the mnemonic ("", ",ma", ",mb", ...)
#   $args  operand text, either "%rX(%rB),%rT"   (format 4, indexed)
#                        or     "disp(%rB),%rT"  (format 5, short offset)
#
# Returns one line of assembler text: "\t.WORD\t0x........\t; <original>"
# when the operands could be encoded, otherwise the original instruction
# text prefixed with a tab.
my $ldd = sub {
    my ($mod,$args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {	# format 4
        my ($index,$base,$target) = ($1,$2,$3);
        my $opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {	# format 5
        my ($disp,$base,$target) = ($1,$2,$3);

        # Only five bits of the displacement are stored below, so anything
        # outside -16..15 would be silently truncated into a different
        # address.  Such operands are passed through untouched and left to
        # the assembler to accept or reject.
        if ($disp >= -16 && $disp <= 15) {
            my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
            $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);	# encode offset
            $opcode |= (1<<5)  if ($mod =~ /^,m/);	# any ",m*" completer
            $opcode |= (1<<13) if ($mod =~ /^,mb/);	# ",mb" only
            return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
        }
    }

    return "\t".$orig;
};
|
||||
|
||||
# Hand-encode an "std" instruction as a raw .WORD so that the module can be
# fed to an assembler running at a pre-2.0 .LEVEL.
#
#   $mod   completer text that followed the mnemonic ("", ",ma", ",mb", ...)
#   $args  operand text of the form "%rS,disp(%rB)"  (format 6)
#
# Returns one line of assembler text: "\t.WORD\t0x........\t; <original>"
# when the operands could be encoded, otherwise the original instruction
# text prefixed with a tab.
my $std = sub {
    my ($mod,$args) = @_;
    my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {	# format 6
        my ($source,$disp,$base) = ($1,$2,$3);

        # Only five bits of the displacement are stored below, so anything
        # outside -16..15 would be silently truncated into a different
        # address.  Such operands are passed through untouched and left to
        # the assembler to accept or reject.
        if ($disp >= -16 && $disp <= 15) {
            my $opcode = (0x03<<26)|($base<<21)|($source<<16)|(1<<12)|(0xB<<6);
            $opcode |= (($disp&0xF)<<1)|(($disp&0x10)>>4);	# encode offset
            $opcode |= (1<<5)  if ($mod =~ /^,m/);	# any ",m*" completer
            $opcode |= (1<<13) if ($mod =~ /^,mb/);	# ",mb" only
            return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
        }
    }

    return "\t".$orig;
};
|
||||
|
||||
# Hand-encode an "extrd" instruction as a raw .WORD.
#
#   $mod   completer text that followed the mnemonic
#   $args  operand text, either "%rS,pos,len,%rT"   (format 15, fixed pos)
#                        or     "%rS,%sar,len,%rT"  (format 12, pos in %sar)
#
# Returns "\t.WORD\t0x........\t; <original>" for the two recognised operand
# shapes and the original instruction text prefixed with a tab otherwise.
# Only the ",u" completer is used by this module; it is implicitly encoded.
my $extrd = sub {
    my ($mod,$args) = @_;
    my $orig = "extrd$mod\t$args";
    my $opcode;

    if (my ($src,$pos,$width,$dst) =
            $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {	# format 15
        my $len = 32-$width;
        $opcode  = (0x36<<26)|($src<<21)|($dst<<16);
        $opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);		# encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
    }
    elsif (my ($from,$bits,$to) =
            $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {	# format 12
        my $len = 32-$bits;
        $opcode  = (0x34<<26)|($from<<21)|($to<<16)|(2<<11)|(1<<9);
        $opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
        # a completer ending in "=" (optionally "*=") sets one more bit
        $opcode |= (1<<13) if ($mod =~ /,\**=/);
    }

    return "\t".$orig unless defined $opcode;
    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
|
||||
|
||||
# Hand-encode a "shrpd" instruction as a raw .WORD.
#
#   $mod   completer text that followed the mnemonic
#   $args  operand text of the form "%rA,%rB,sa,%rT"  (format 14)
#
# Returns "\t.WORD\t0x........\t; <original>" when the operands match that
# shape and the original instruction text prefixed with a tab otherwise.
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $orig = "shrpd$mod\t$args";

    my ($first,$second,$shift,$target) =
        $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/	# format 14
            or return "\t".$orig;

    # the shift amount is stored as 63-sa, split over two bit fields
    my $cpos = 63-$shift;
    my $opcode = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
    $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
|
||||
|
||||
# Hand-encode "sub,db" as a raw .WORD; every other form of "sub" is passed
# through as plain text.
#
#   $mod   completer text that followed the mnemonic; only ",db" is encoded
#   $args  operand text of the form "%rA,%rB,%rT"
#
# Returns "\t.WORD\t0x........\t; <original>" for "sub,db" with three
# register operands and the original instruction text prefixed with a tab
# otherwise.
my $sub = sub {
    my ($mod,$args) = @_;
    my $orig = "sub$mod\t$args";

    return "\t".$orig unless $mod eq ",db";

    my ($first,$second,$target) =
        $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/
            or return "\t".$orig;

    my $opcode = (0x02<<26)|($second<<21)|($first<<16)|$target
               | (1<<10)	# e1
               | (1<<8)		# e2
               | (1<<5);	# d

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
|
||||
|
||||
# Dispatch one instruction to its hand-written encoder, if there is one.
#
#   $mnemonic  instruction name (lower-case letters only, as matched by the
#              caller's regex)
#   $mod       completer text that followed the mnemonic
#   $args      operand text
#
# The variable named after the mnemonic is looked up with a string eval;
# when it holds a code reference that encoder is called with ($mod,$args)
# and its result returned.  Any other instruction is re-emitted as
# "\t<mnemonic><mod>\t<args>".
sub assemble {
    my ($mnemonic,$mod,$args)=@_;
    my $encoder = eval("\$$mnemonic");

    return $encoder->($mod,$args) if ref($encoder) eq 'CODE';
    return "\t$mnemonic$mod\t$args";
}
|
||||
|
||||
# Post-process the code template line by line and print the result.
foreach (split("\n",$code)) {
    # evaluate the arithmetic enclosed in backticks
    s/\`([^\`]*)\`/eval $1/ge;

    # flip word order in 64-bit mode...
    s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e	if ($BN_SZ==8);

    # assemble 2.0 instructions in 32-bit mode...
    s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e	if ($BN_SZ==4);

    print $_,"\n";
}

# Buffered write errors (full disk, closed pipe) only surface here, so a
# failed close must not go unnoticed or a truncated file would be assembled.
close STDOUT or die "error closing STDOUT: $!";
|
224
crypto/pariscid.pl
Normal file
224
crypto/pariscid.pl
Normal file
@ -0,0 +1,224 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Command line: <flavour> [<output file>]
$flavour = shift;
$output = shift;
# Redirect STDOUT only when an output file was actually given, and stop
# right away if it cannot be created instead of printing to the terminal.
# The 3-argument form keeps unusual file names from being read as a mode.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# A flavour containing "64" selects the 2.0W settings, anything else the
# 32-bit 1.1 ones.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$ST		="std";
} else {
	$LEVEL		="1.1";
	$SIZE_T		=4;
	$ST		="stw";
}
|
||||
|
||||
$rp="%r2";
|
||||
$sp="%r30";
|
||||
$rv="%r28";
|
||||
|
||||
$code=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.EXPORT OPENSSL_cpuid_setup,ENTRY
|
||||
.ALIGN 8
|
||||
OPENSSL_cpuid_setup
|
||||
.PROC
|
||||
.CALLINFO NO_CALLS
|
||||
.ENTRY
|
||||
bv ($rp)
|
||||
.EXIT
|
||||
nop
|
||||
.PROCEND
|
||||
|
||||
.EXPORT OPENSSL_rdtsc,ENTRY
|
||||
.ALIGN 8
|
||||
OPENSSL_rdtsc
|
||||
.PROC
|
||||
.CALLINFO NO_CALLS
|
||||
.ENTRY
|
||||
mfctl %cr16,$rv
|
||||
bv ($rp)
|
||||
.EXIT
|
||||
nop
|
||||
.PROCEND
|
||||
|
||||
.EXPORT OPENSSL_wipe_cpu,ENTRY
|
||||
.ALIGN 8
|
||||
OPENSSL_wipe_cpu
|
||||
.PROC
|
||||
.CALLINFO NO_CALLS
|
||||
.ENTRY
|
||||
xor %r0,%r0,%r1
|
||||
fcpy,dbl %fr0,%fr4
|
||||
xor %r0,%r0,%r19
|
||||
fcpy,dbl %fr0,%fr5
|
||||
xor %r0,%r0,%r20
|
||||
fcpy,dbl %fr0,%fr6
|
||||
xor %r0,%r0,%r21
|
||||
fcpy,dbl %fr0,%fr7
|
||||
xor %r0,%r0,%r22
|
||||
fcpy,dbl %fr0,%fr8
|
||||
xor %r0,%r0,%r23
|
||||
fcpy,dbl %fr0,%fr9
|
||||
xor %r0,%r0,%r24
|
||||
fcpy,dbl %fr0,%fr10
|
||||
xor %r0,%r0,%r25
|
||||
fcpy,dbl %fr0,%fr11
|
||||
xor %r0,%r0,%r26
|
||||
fcpy,dbl %fr0,%fr22
|
||||
xor %r0,%r0,%r29
|
||||
fcpy,dbl %fr0,%fr23
|
||||
xor %r0,%r0,%r31
|
||||
fcpy,dbl %fr0,%fr24
|
||||
fcpy,dbl %fr0,%fr25
|
||||
fcpy,dbl %fr0,%fr26
|
||||
fcpy,dbl %fr0,%fr27
|
||||
fcpy,dbl %fr0,%fr28
|
||||
fcpy,dbl %fr0,%fr29
|
||||
fcpy,dbl %fr0,%fr30
|
||||
fcpy,dbl %fr0,%fr31
|
||||
bv ($rp)
|
||||
.EXIT
|
||||
ldo 0($sp),$rv
|
||||
.PROCEND
|
||||
___
|
||||
{
|
||||
my $inp="%r26";
|
||||
my $len="%r25";
|
||||
|
||||
$code.=<<___;
|
||||
.EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
|
||||
.ALIGN 8
|
||||
OPENSSL_cleanse
|
||||
.PROC
|
||||
.CALLINFO NO_CALLS
|
||||
.ENTRY
|
||||
cmpib,*= 0,$len,Ldone
|
||||
nop
|
||||
cmpib,*>>= 15,$len,Little
|
||||
ldi $SIZE_T-1,%r1
|
||||
|
||||
Lalign
|
||||
and,*<> $inp,%r1,%r28
|
||||
b,n Laligned
|
||||
stb %r0,0($inp)
|
||||
ldo -1($len),$len
|
||||
b Lalign
|
||||
ldo 1($inp),$inp
|
||||
|
||||
Laligned
|
||||
andcm $len,%r1,%r28
|
||||
Lot
|
||||
$ST %r0,0($inp)
|
||||
addib,*<> -$SIZE_T,%r28,Lot
|
||||
ldo $SIZE_T($inp),$inp
|
||||
|
||||
and,*<> $len,%r1,$len
|
||||
b,n Ldone
|
||||
Little
|
||||
stb %r0,0($inp)
|
||||
addib,*<> -1,$len,Little
|
||||
ldo 1($inp),$inp
|
||||
Ldone
|
||||
bv ($rp)
|
||||
.EXIT
|
||||
nop
|
||||
.PROCEND
|
||||
___
|
||||
}
|
||||
{
|
||||
my ($out,$cnt,$max)=("%r26","%r25","%r24");
|
||||
my ($tick,$lasttick)=("%r23","%r22");
|
||||
my ($diff,$lastdiff)=("%r21","%r20");
|
||||
|
||||
$code.=<<___;
|
||||
.EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
|
||||
.ALIGN 8
|
||||
OPENSSL_instrument_bus
|
||||
.PROC
|
||||
.CALLINFO NO_CALLS
|
||||
.ENTRY
|
||||
copy $cnt,$rv
|
||||
mfctl %cr16,$tick
|
||||
copy $tick,$lasttick
|
||||
ldi 0,$diff
|
||||
|
||||
fdc 0($out)
|
||||
ldw 0($out),$tick
|
||||
add $diff,$tick,$tick
|
||||
stw $tick,0($out)
|
||||
Loop
|
||||
mfctl %cr16,$tick
|
||||
sub $tick,$lasttick,$diff
|
||||
copy $tick,$lasttick
|
||||
|
||||
fdc 0($out)
|
||||
ldw 0($out),$tick
|
||||
add $diff,$tick,$tick
|
||||
stw $tick,0($out)
|
||||
|
||||
addib,<> -1,$cnt,Loop
|
||||
addi 4,$out,$out
|
||||
|
||||
bv ($rp)
|
||||
.EXIT
|
||||
sub $rv,$cnt,$rv
|
||||
.PROCEND
|
||||
|
||||
.EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
|
||||
.ALIGN 8
|
||||
OPENSSL_instrument_bus2
|
||||
.PROC
|
||||
.CALLINFO NO_CALLS
|
||||
.ENTRY
|
||||
copy $cnt,$rv
|
||||
sub %r0,$cnt,$cnt
|
||||
|
||||
mfctl %cr16,$tick
|
||||
copy $tick,$lasttick
|
||||
ldi 0,$diff
|
||||
|
||||
fdc 0($out)
|
||||
ldw 0($out),$tick
|
||||
add $diff,$tick,$tick
|
||||
stw $tick,0($out)
|
||||
|
||||
mfctl %cr16,$tick
|
||||
sub $tick,$lasttick,$diff
|
||||
copy $tick,$lasttick
|
||||
Loop2
|
||||
copy $diff,$lastdiff
|
||||
fdc 0($out)
|
||||
ldw 0($out),$tick
|
||||
add $diff,$tick,$tick
|
||||
stw $tick,0($out)
|
||||
|
||||
addib,= -1,$max,Ldone2
|
||||
nop
|
||||
|
||||
mfctl %cr16,$tick
|
||||
sub $tick,$lasttick,$diff
|
||||
copy $tick,$lasttick
|
||||
cmpclr,<> $lastdiff,$diff,$tick
|
||||
ldi 1,$tick
|
||||
|
||||
ldi 1,%r1
|
||||
xor %r1,$tick,$tick
|
||||
addb,<> $tick,$cnt,Loop2
|
||||
shladd,l $tick,2,$out,$out
|
||||
Ldone2
|
||||
bv ($rp)
|
||||
.EXIT
|
||||
add $rv,$cnt,$rv
|
||||
.PROCEND
|
||||
___
|
||||
}
|
||||
# For the 32-bit flavour ($SIZE_T==4) rewrite "cmpib,*" to "comib," and
# strip the "*" from every remaining ",*" completer, then emit the result.
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
$code =~ s/,\*/,/gm if ($SIZE_T==4);
print $code;
# Write errors on a buffered handle are reported by close; don't ignore them.
close STDOUT or die "error closing STDOUT: $!";
|
||||
|
313
crypto/rc4/asm/rc4-parisc.pl
Normal file
313
crypto/rc4/asm/rc4-parisc.pl
Normal file
@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# RC4 for PA-RISC.
|
||||
|
||||
# June 2009.
|
||||
#
|
||||
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
|
||||
# For reference, [4x] unrolled loop is >40% faster than folded one.
|
||||
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
|
||||
# is believed to be not sufficient to justify the effort...
|
||||
#
|
||||
# Special thanks to polarhome.com for providing HP-UX account.
|
||||
|
||||
# Directory part of the script path (with trailing separator); it is used
# further down to locate opensslconf.h. Empty when $0 has no directory
# component - $1 is only consulted if the match actually succeeded.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Command line: <flavour> [<output file>]
$flavour = shift;
$output = shift;
# Redirect STDOUT only when an output file was given, and stop right away
# if it cannot be created instead of carrying on with the old STDOUT.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# A flavour containing "64" selects the 2.0W settings, anything else the
# 32-bit 1.0 ones.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
} else {
	$LEVEL		="1.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
}
|
||||
|
||||
$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
				#                 [+ argument transfer]
# Size in bytes of one key-schedule element. Stays at 1 unless a readable
# opensslconf.h defines RC4_INT as something that does not end in "char".
$SZ=1;				# defaults to RC4_CHAR
if (open(my $conf,"<","${dir}../../opensslconf.h")) {
	while(<$conf>) {
		if (m/#\s*define\s+RC4_INT\s+(.*)/) {
			# tolerate trailing blanks or a CR (CRLF checkout)
			# after the type name, otherwise "char" is missed
			$SZ = ($1=~/char\s*$/) ? 1 : 4;
			last;
		}
	}
	close $conf;
}

# Pick load/store/index mnemonics matching the element size.
if ($SZ==1) {	# RC4_CHAR
	$LD="ldb";
	$LDX="ldbx";
	$MKX="addl";
	$ST="stb";
} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
	$LD="ldw";
	$LDX="ldwx,s";
	$MKX="sh2addl";
	$ST="stw";
}
|
||||
|
||||
$key="%r26";
|
||||
$len="%r25";
|
||||
$inp="%r24";
|
||||
$out="%r23";
|
||||
|
||||
@XX=("%r19","%r20");
|
||||
@TX=("%r21","%r22");
|
||||
$YY="%r28";
|
||||
$TY="%r29";
|
||||
|
||||
$acc="%r1";
|
||||
$ix="%r2";
|
||||
$iy="%r3";
|
||||
$dat0="%r4";
|
||||
$dat1="%r5";
|
||||
$rem="%r6";
|
||||
$mask="%r31";
|
||||
|
||||
# Append four copies ($i = 0..3) of the RC4 loop body to $code.
# After each copy @TX and @XX are rotated, so the next copy uses the other
# register of each pair. The two backquoted sprintf lines are expanded
# later, when the script runs its `...` eval substitution over $code, and
# only produce an instruction for $i>0 (the "%$TY"-style "%%rNN" sequences
# come out of sprintf as literal "%rNN" register names).
sub unrolledloopbody {
|
||||
for ($i=0;$i<4;$i++) {
|
||||
$code.=<<___;
|
||||
ldo 1($XX[0]),$XX[1]
|
||||
`sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
|
||||
and $mask,$XX[1],$XX[1]
|
||||
$LDX $YY($key),$TY
|
||||
$MKX $YY,$key,$ix
|
||||
$LDX $XX[1]($key),$TX[1]
|
||||
$MKX $XX[0],$key,$iy
|
||||
$ST $TX[0],0($ix)
|
||||
comclr,<> $XX[1],$YY,%r0 ; conditional
|
||||
copy $TX[0],$TX[1] ; move
|
||||
`sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
|
||||
$ST $TY,0($iy)
|
||||
addl $TX[0],$TY,$TY
|
||||
addl $TX[1],$YY,$YY
|
||||
and $mask,$TY,$TY
|
||||
and $mask,$YY,$YY
|
||||
___
|
||||
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
|
||||
} }
|
||||
|
||||
# Append a byte-at-a-time RC4 loop to $code.
#   $label  text emitted as the loop label and used as the branch target
#   $count  name of the register that the closing addib decrements by one
#           per iteration; the loop repeats while the result is non-zero
# Each iteration loads one input byte with ldbx, xors it with a key-stream
# value and stores the result with stb.
sub foldedloop {
|
||||
my ($label,$count)=@_;
|
||||
$code.=<<___;
|
||||
$label
|
||||
$MKX $YY,$key,$iy
|
||||
$LDX $YY($key),$TY
|
||||
$MKX $XX[0],$key,$ix
|
||||
$ST $TX[0],0($iy)
|
||||
ldo 1($XX[0]),$XX[0]
|
||||
$ST $TY,0($ix)
|
||||
addl $TX[0],$TY,$TY
|
||||
ldbx $inp($out),$dat1
|
||||
and $mask,$TY,$TY
|
||||
and $mask,$XX[0],$XX[0]
|
||||
$LDX $TY($key),$acc
|
||||
$LDX $XX[0]($key),$TX[0]
|
||||
ldo 1($out),$out
|
||||
xor $dat1,$acc,$acc
|
||||
addl $TX[0],$YY,$YY
|
||||
stb $acc,-1($out)
|
||||
addib,<> -1,$count,$label ; $count is always small
|
||||
and $mask,$YY,$YY
|
||||
___
|
||||
}
|
||||
|
||||
$code=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
|
||||
RC4
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
|
||||
cmpib,*= 0,$len,L\$abort
|
||||
sub $inp,$out,$inp ; distance between $inp and $out
|
||||
|
||||
$LD `0*$SZ`($key),$XX[0]
|
||||
$LD `1*$SZ`($key),$YY
|
||||
ldo `2*$SZ`($key),$key
|
||||
|
||||
ldi 0xff,$mask
|
||||
ldi 3,$dat0
|
||||
|
||||
ldo 1($XX[0]),$XX[0] ; warm up loop
|
||||
and $mask,$XX[0],$XX[0]
|
||||
$LDX $XX[0]($key),$TX[0]
|
||||
addl $TX[0],$YY,$YY
|
||||
cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
|
||||
and $mask,$YY,$YY
|
||||
|
||||
and,<> $out,$dat0,$rem ; is $out aligned?
|
||||
b L\$alignedout
|
||||
subi 4,$rem,$rem
|
||||
sub $len,$rem,$len
|
||||
___
|
||||
&foldedloop("L\$alignout",$rem); # process till $out is aligned
|
||||
|
||||
$code.=<<___;
|
||||
L\$alignedout ; $len is at least 4 here
|
||||
and,<> $inp,$dat0,$acc ; is $inp aligned?
|
||||
b L\$oop4
|
||||
sub $inp,$acc,$rem ; align $inp
|
||||
|
||||
sh3addl $acc,%r0,$acc
|
||||
subi 32,$acc,$acc
|
||||
mtctl $acc,%cr11 ; load %sar with vshd align factor
|
||||
ldwx $rem($out),$dat0
|
||||
ldo 4($rem),$rem
|
||||
L\$oop4misalignedinp
|
||||
___
|
||||
&unrolledloopbody();
|
||||
$code.=<<___;
|
||||
$LDX $TY($key),$ix
|
||||
ldwx $rem($out),$dat1
|
||||
ldo -4($len),$len
|
||||
or $ix,$acc,$acc ; last piece, no need to dep
|
||||
vshd $dat0,$dat1,$iy ; align data
|
||||
copy $dat1,$dat0
|
||||
xor $iy,$acc,$acc
|
||||
stw $acc,0($out)
|
||||
cmpib,*<< 3,$len,L\$oop4misalignedinp
|
||||
ldo 4($out),$out
|
||||
cmpib,*= 0,$len,L\$done
|
||||
nop
|
||||
b L\$oop1
|
||||
nop
|
||||
|
||||
.ALIGN 8
|
||||
L\$oop4
|
||||
___
|
||||
&unrolledloopbody();
|
||||
$code.=<<___;
|
||||
$LDX $TY($key),$ix
|
||||
ldwx $inp($out),$dat0
|
||||
ldo -4($len),$len
|
||||
or $ix,$acc,$acc ; last piece, no need to dep
|
||||
xor $dat0,$acc,$acc
|
||||
stw $acc,0($out)
|
||||
cmpib,*<< 3,$len,L\$oop4
|
||||
ldo 4($out),$out
|
||||
cmpib,*= 0,$len,L\$done
|
||||
nop
|
||||
___
|
||||
&foldedloop("L\$oop1",$len);
|
||||
$code.=<<___;
|
||||
L\$done
|
||||
$POP `-$FRAME-$SAVED_RP`(%sp),%r2
|
||||
ldo -1($XX[0]),$XX[0] ; chill out loop
|
||||
sub $YY,$TX[0],$YY
|
||||
and $mask,$XX[0],$XX[0]
|
||||
and $mask,$YY,$YY
|
||||
$ST $XX[0],`-2*$SZ`($key)
|
||||
$ST $YY,`-1*$SZ`($key)
|
||||
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
||||
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
||||
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
||||
L\$abort
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
$POPMB -$FRAME(%sp),%r3
|
||||
.PROCEND
|
||||
___
|
||||
|
||||
$code.=<<___;
|
||||
|
||||
.EXPORT RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
|
||||
.ALIGN 8
|
||||
RC4_set_key
|
||||
.PROC
|
||||
.CALLINFO NO_CALLS
|
||||
.ENTRY
|
||||
$ST %r0,`0*$SZ`($key)
|
||||
$ST %r0,`1*$SZ`($key)
|
||||
ldo `2*$SZ`($key),$key
|
||||
copy %r0,@XX[0]
|
||||
L\$1st
|
||||
$ST @XX[0],0($key)
|
||||
ldo 1(@XX[0]),@XX[0]
|
||||
bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
|
||||
ldo $SZ($key),$key
|
||||
|
||||
ldo `-256*$SZ`($key),$key ; rewind $key
|
||||
addl $len,$inp,$inp ; $inp to point at the end
|
||||
sub %r0,$len,%r23 ; inverse index
|
||||
copy %r0,@XX[0]
|
||||
copy %r0,@XX[1]
|
||||
ldi 0xff,$mask
|
||||
|
||||
L\$2nd
|
||||
$LDX @XX[0]($key),@TX[0]
|
||||
ldbx %r23($inp),@TX[1]
|
||||
addi,nuv 1,%r23,%r23 ; increment and conditional
|
||||
sub %r0,$len,%r23 ; inverse index
|
||||
addl @TX[0],@XX[1],@XX[1]
|
||||
addl @TX[1],@XX[1],@XX[1]
|
||||
and $mask,@XX[1],@XX[1]
|
||||
$MKX @XX[0],$key,$TY
|
||||
$LDX @XX[1]($key),@TX[1]
|
||||
$MKX @XX[1],$key,$YY
|
||||
ldo 1(@XX[0]),@XX[0]
|
||||
$ST @TX[0],0($YY)
|
||||
bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
|
||||
$ST @TX[1],0($TY)
|
||||
|
||||
bv,n (%r2)
|
||||
.EXIT
|
||||
nop
|
||||
.PROCEND
|
||||
|
||||
.EXPORT RC4_options,ENTRY
|
||||
.ALIGN 8
|
||||
RC4_options
|
||||
.PROC
|
||||
.CALLINFO NO_CALLS
|
||||
.ENTRY
|
||||
blr %r0,%r28
|
||||
ldi 3,%r1
|
||||
L\$pic
|
||||
andcm %r28,%r1,%r28
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
ldo L\$opts-L\$pic(%r28),%r28
|
||||
.PROCEND
|
||||
.ALIGN 8
|
||||
L\$opts
|
||||
.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
|
||||
.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
# Replace every `...` span in the generated code by the result of
# evaluating it as Perl; for the 32-bit flavour ($SIZE_T==4) additionally
# rewrite "cmpib,*" to "comib,".
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);

print $code;
# Write errors on a buffered handle are reported by close; don't ignore them.
close STDOUT or die "error closing STDOUT: $!";
|
259
crypto/sha/asm/sha1-parisc.pl
Normal file
259
crypto/sha/asm/sha1-parisc.pl
Normal file
@ -0,0 +1,259 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# SHA1 block procedure for PA-RISC.
|
||||
|
||||
# June 2009.
|
||||
#
|
||||
# On PA-7100LC performance is >30% better than gcc 3.2 generated code
|
||||
# for aligned input and >50% better for unaligned. Compared to vendor
|
||||
# compiler on PA-8600 it's almost 60% faster in 64-bit build and just
|
||||
# a few percent faster in 32-bit one (this is for aligned input; data for
|
||||
# unaligned input is not available).
|
||||
#
|
||||
# Special thanks to polarhome.com for providing HP-UX account.
|
||||
|
||||
# Command line: <flavour> [<output file>]
$flavour = shift;
$output = shift;
# Redirect STDOUT only when an output file was given, and stop right away
# if it cannot be created instead of carrying on with the old STDOUT.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# A flavour containing "64" selects the 2.0W settings, anything else the
# 32-bit 1.0 ones.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
} else {
	$LEVEL		="1.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
}
|
||||
|
||||
$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
|
||||
# [+ argument transfer]
|
||||
$ctx="%r26"; # arg0
|
||||
$inp="%r25"; # arg1
|
||||
$num="%r24"; # arg2
|
||||
|
||||
$t0="%r28";
|
||||
$t1="%r29";
|
||||
$K="%r31";
|
||||
|
||||
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
|
||||
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$t0);
|
||||
|
||||
@V=($A,$B,$C,$D,$E)=("%r19","%r20","%r21","%r22","%r23");
|
||||
|
||||
# Append the code for one SHA-1 round of the first group (called for
# rounds 0..19) to $code.
#   $i          round number
#   $a..$e      names of the registers currently holding the five working
#               variables
# For $i<15 the plain round is emitted, reading X[$i] directly. For $i>=15
# the same round is interleaved with the update of X[($i+1)%16] (xor of
# three other X entries followed by a rotate), i.e. the schedule word is
# prepared one round ahead. The round function computed into $t0 is
# (c AND b) OR (d AND NOT b).
sub BODY_00_19 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
$code.=<<___ if ($i<15);
|
||||
addl $K,$e,$e ; $i
|
||||
shd $a,$a,27,$t1
|
||||
addl @X[$i],$e,$e
|
||||
and $c,$b,$t0
|
||||
addl $t1,$e,$e
|
||||
andcm $d,$b,$t1
|
||||
shd $b,$b,2,$b
|
||||
or $t1,$t0,$t0
|
||||
addl $t0,$e,$e
|
||||
___
|
||||
$code.=<<___ if ($i>=15); # with forward Xupdate
|
||||
addl $K,$e,$e ; $i
|
||||
shd $a,$a,27,$t1
|
||||
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
|
||||
addl @X[$i%16],$e,$e
|
||||
and $c,$b,$t0
|
||||
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
|
||||
addl $t1,$e,$e
|
||||
andcm $d,$b,$t1
|
||||
shd $b,$b,2,$b
|
||||
or $t1,$t0,$t0
|
||||
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
|
||||
add $t0,$e,$e
|
||||
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
|
||||
___
|
||||
}
|
||||
|
||||
# Append the code for one SHA-1 round using the three-way xor round
# function (b XOR c XOR d) to $code; the script calls it for rounds 20..39
# and again for rounds 60..79.
#   $i          round number
#   $a..$e      names of the registers currently holding the five working
#               variables
# For $i<79 the round is interleaved with the update of X[($i+1)%16].
# For the very last round ($i==79) there is no schedule word left to
# prepare; instead the five context words are loaded from $ctx into
# X[0..4] in between the round's instructions.
sub BODY_20_39 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
$code.=<<___ if ($i<79);
|
||||
xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
|
||||
addl $K,$e,$e
|
||||
shd $a,$a,27,$t1
|
||||
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
|
||||
addl @X[$i%16],$e,$e
|
||||
xor $b,$c,$t0
|
||||
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
|
||||
addl $t1,$e,$e
|
||||
shd $b,$b,2,$b
|
||||
xor $d,$t0,$t0
|
||||
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
|
||||
addl $t0,$e,$e
|
||||
___
|
||||
$code.=<<___ if ($i==79); # with context load
|
||||
ldw 0($ctx),@X[0] ; $i
|
||||
addl $K,$e,$e
|
||||
shd $a,$a,27,$t1
|
||||
ldw 4($ctx),@X[1]
|
||||
addl @X[$i%16],$e,$e
|
||||
xor $b,$c,$t0
|
||||
ldw 8($ctx),@X[2]
|
||||
addl $t1,$e,$e
|
||||
shd $b,$b,2,$b
|
||||
xor $d,$t0,$t0
|
||||
ldw 12($ctx),@X[3]
|
||||
addl $t0,$e,$e
|
||||
ldw 16($ctx),@X[4]
|
||||
___
|
||||
}
|
||||
|
||||
# Append the code for one SHA-1 round of the third group (called for
# rounds 40..59) to $code, interleaved with the update of X[($i+1)%16].
#   $i          round number
#   $a..$e      names of the registers currently holding the five working
#               variables
# The round function is added to e in two pieces: first
# ((d XOR c) AND b) via $t0, then (d AND c) via $t1.
sub BODY_40_59 {
|
||||
my ($i,$a,$b,$c,$d,$e)=@_;
|
||||
my $j=$i+1;
|
||||
$code.=<<___;
|
||||
shd $a,$a,27,$t1 ; $i
|
||||
addl $K,$e,$e
|
||||
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
|
||||
xor $d,$c,$t0
|
||||
addl @X[$i%16],$e,$e
|
||||
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
|
||||
and $b,$t0,$t0
|
||||
addl $t1,$e,$e
|
||||
shd $b,$b,2,$b
|
||||
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
|
||||
addl $t0,$e,$e
|
||||
and $d,$c,$t1
|
||||
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
|
||||
addl $t1,$e,$e
|
||||
___
|
||||
}
|
||||
|
||||
$code=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
|
||||
sha1_block_data_order
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
||||
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
||||
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
||||
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
||||
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
|
||||
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
|
||||
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
|
||||
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
|
||||
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
|
||||
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
|
||||
|
||||
ldw 0($ctx),$A
|
||||
ldw 4($ctx),$B
|
||||
ldw 8($ctx),$C
|
||||
ldw 12($ctx),$D
|
||||
ldw 16($ctx),$E
|
||||
|
||||
extru $inp,31,2,$t0 ; t0=inp&3;
|
||||
sh3addl $t0,%r0,$t0 ; t0*=8;
|
||||
subi 32,$t0,$t0 ; t0=32-t0;
|
||||
mtctl $t0,%cr11 ; %sar=t0;
|
||||
|
||||
L\$oop
|
||||
ldi 3,$t0
|
||||
andcm $inp,$t0,$t0 ; 64-bit neutral
|
||||
___
|
||||
for ($i=0;$i<15;$i++) { # load input block
|
||||
$code.="\tldw `4*$i`($t0),@X[$i]\n"; }
|
||||
$code.=<<___;
|
||||
cmpb,*= $inp,$t0,L\$aligned
|
||||
ldw 60($t0),@X[15]
|
||||
ldw 64($t0),@X[16]
|
||||
___
|
||||
for ($i=0;$i<16;$i++) { # align input
|
||||
$code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
|
||||
$code.=<<___;
|
||||
L\$aligned
|
||||
ldil L'0x5a827000,$K ; K_00_19
|
||||
ldo 0x999($K),$K
|
||||
___
|
||||
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
ldil L'0x6ed9e000,$K ; K_20_39
|
||||
ldo 0xba1($K),$K
|
||||
___
|
||||
|
||||
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
ldil L'0x8f1bb000,$K ; K_40_59
|
||||
ldo 0xcdc($K),$K
|
||||
___
|
||||
|
||||
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
ldil L'0xca62c000,$K ; K_60_79
|
||||
ldo 0x1d6($K),$K
|
||||
___
|
||||
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
||||
|
||||
$code.=<<___;
|
||||
addl @X[0],$A,$A
|
||||
addl @X[1],$B,$B
|
||||
addl @X[2],$C,$C
|
||||
addl @X[3],$D,$D
|
||||
addl @X[4],$E,$E
|
||||
stw $A,0($ctx)
|
||||
stw $B,4($ctx)
|
||||
stw $C,8($ctx)
|
||||
stw $D,12($ctx)
|
||||
stw $E,16($ctx)
|
||||
addib,*<> -1,$num,L\$oop
|
||||
ldo 64($inp),$inp
|
||||
|
||||
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
|
||||
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
||||
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
||||
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
||||
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
|
||||
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
|
||||
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
|
||||
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
|
||||
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
|
||||
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
|
||||
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
|
||||
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
|
||||
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
|
||||
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
$POPMB -$FRAME(%sp),%r3
|
||||
.PROCEND
|
||||
.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
# Replace every `...` span in the generated code by the result of
# evaluating it as Perl; for the 32-bit flavour ($SIZE_T==4) additionally
# strip the "*" from every ",*" completer.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/,\*/,/gm if ($SIZE_T==4);
print $code;
# Write errors on a buffered handle are reported by close; don't ignore them.
close STDOUT or die "error closing STDOUT: $!";
|
791
crypto/sha/asm/sha512-parisc.pl
Executable file
791
crypto/sha/asm/sha512-parisc.pl
Executable file
@ -0,0 +1,791 @@
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# ====================================================================
|
||||
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
||||
# project. The module is, however, dual licensed under OpenSSL and
|
||||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||||
# ====================================================================
|
||||
|
||||
# SHA256/512 block procedure for PA-RISC.
|
||||
|
||||
# June 2009.
|
||||
#
|
||||
# SHA256 performance is >75% better than gcc 3.2 generated code on
|
||||
# PA-7100LC. Compared to code generated by vendor compiler this
|
||||
# implementation is almost 70% faster in 64-bit build, but delivers
|
||||
# virtually same performance in 32-bit build on PA-8600.
|
||||
#
|
||||
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
|
||||
# PA-7100LC, a PA-RISC 1.1 processor. The implementation then detects if the
|
||||
# code is executed on PA-RISC 2.0 processor and switches to 64-bit
|
||||
# code path delivering adequate performance even in "blended" 32-bit
|
||||
# build. Though 64-bit code is not any faster than code generated by
|
||||
# vendor compiler on PA-8600...
|
||||
#
|
||||
# Special thanks to polarhome.com for providing HP-UX account.
|
||||
|
||||
# Command line: <flavour> [<output file>]
$flavour = shift;
$output = shift;
# Redirect STDOUT only when an output file was given, and stop right away
# if it cannot be created instead of carrying on with the old STDOUT.
if (defined($output) && $output ne "") {
	open STDOUT,">",$output or die "can't open $output: $!";
}

# A flavour containing "64" selects the 2.0W settings, anything else the
# 32-bit 1.0 ones.
if ($flavour =~ /64/) {
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
} else {
	$LEVEL		="1.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
}
|
||||
|
||||
if ($output =~ /512/) {
|
||||
$func="sha512_block_data_order";
|
||||
$SZ=8;
|
||||
@Sigma0=(28,34,39);
|
||||
@Sigma1=(14,18,41);
|
||||
@sigma0=(1, 8, 7);
|
||||
@sigma1=(19,61, 6);
|
||||
$rounds=80;
|
||||
$LAST10BITS=0x017;
|
||||
$LD="ldd";
|
||||
$LDM="ldd,ma";
|
||||
$ST="std";
|
||||
} else {
|
||||
$func="sha256_block_data_order";
|
||||
$SZ=4;
|
||||
@Sigma0=( 2,13,22);
|
||||
@Sigma1=( 6,11,25);
|
||||
@sigma0=( 7,18, 3);
|
||||
@sigma1=(17,19,10);
|
||||
$rounds=64;
|
||||
$LAST10BITS=0x0f2;
|
||||
$LD="ldw";
|
||||
$LDM="ldwm";
|
||||
$ST="stw";
|
||||
}
|
||||
|
||||
$FRAME=16*$SIZE_T+$FRAME_MARKER;# 16 saved regs + frame marker
|
||||
# [+ argument transfer]
|
||||
$XOFF=16*$SZ+32; # local variables
|
||||
$FRAME+=$XOFF;
|
||||
$XOFF+=$FRAME_MARKER; # distance between %sp and local variables
|
||||
|
||||
$ctx="%r26"; # zapped by $a0
|
||||
$inp="%r25"; # zapped by $a1
|
||||
$num="%r24"; # zapped by $t0
|
||||
|
||||
$a0 ="%r26";
|
||||
$a1 ="%r25";
|
||||
$t0 ="%r24";
|
||||
$t1 ="%r29";
|
||||
$Tbl="%r31";
|
||||
|
||||
@V=($A,$B,$C,$D,$E,$F,$G,$H)=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
|
||||
|
||||
@X=("%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
|
||||
"%r9", "%r10","%r11","%r12","%r13","%r14","%r15","%r16",$inp);
|
||||
|
||||
# Append the code for one SHA-256/512 round to $code.
#   $i      round number; X[$i%16] is the message word added to h
#   $a..$h  names of the registers currently holding the eight working
#           variables
# In the emitted code the value already sitting in $t1 is added to h first
# and $t1 is then reused as scratch; for $i<15 the round also loads a word
# addressed through $Tbl into $t1 with $LDM, ready for the next round.
# NOTE(review): _ror is not a native mnemonic, and the backquoted spans
# need evaluating - presumably both are handled by a post-processing pass
# that lies outside the part of the file visible here; confirm.
sub ROUND_00_15 {
|
||||
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
|
||||
$code.=<<___;
|
||||
_ror $e,$Sigma1[0],$a0
|
||||
and $f,$e,$t0
|
||||
_ror $e,$Sigma1[1],$a1
|
||||
addl $t1,$h,$h
|
||||
andcm $g,$e,$t1
|
||||
xor $a1,$a0,$a0
|
||||
_ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
|
||||
or $t0,$t1,$t1 ; Ch(e,f,g)
|
||||
addl @X[$i%16],$h,$h
|
||||
xor $a0,$a1,$a1 ; Sigma1(e)
|
||||
addl $t1,$h,$h
|
||||
_ror $a,$Sigma0[0],$a0
|
||||
addl $a1,$h,$h
|
||||
|
||||
_ror $a,$Sigma0[1],$a1
|
||||
and $a,$b,$t0
|
||||
and $a,$c,$t1
|
||||
xor $a1,$a0,$a0
|
||||
_ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
|
||||
xor $t1,$t0,$t0
|
||||
and $b,$c,$t1
|
||||
xor $a0,$a1,$a1 ; Sigma0(a)
|
||||
addl $h,$d,$d
|
||||
xor $t1,$t0,$t0 ; Maj(a,b,c)
|
||||
`"$LDM $SZ($Tbl),$t1" if ($i<15)`
|
||||
addl $a1,$h,$h
|
||||
addl $t0,$h,$h
|
||||
|
||||
___
|
||||
}
|
||||
|
||||
# Append the code for one SHA-256/512 round with message-schedule update
# to $code. The script calls it with $i = 16..31; 16 is subtracted so that
# $i indexes @X directly.
#   $i      round number (16-based on entry)
#   $a..$h  names of the registers currently holding the eight working
#           variables
# The emitted code adds X[$i+9], sigma0(X[$i+1]) and sigma1(X[$i+14])
# (indices mod 16) to X[$i], loads a table word through $Tbl into $t1, and
# is followed by a regular round from ROUND_00_15. For the last slot
# ($i==15 after the subtraction) the low 10 bits of the freshly loaded
# table word are compared with $LAST10BITS and, on a match, $Tbl is bumped
# by one to flag the end of the table.
sub ROUND_16_xx {
|
||||
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
|
||||
$i-=16;
|
||||
$code.=<<___;
|
||||
_ror @X[($i+1)%16],$sigma0[0],$a0
|
||||
_ror @X[($i+1)%16],$sigma0[1],$a1
|
||||
addl @X[($i+9)%16],@X[$i],@X[$i]
|
||||
_ror @X[($i+14)%16],$sigma1[0],$t0
|
||||
_ror @X[($i+14)%16],$sigma1[1],$t1
|
||||
xor $a1,$a0,$a0
|
||||
_shr @X[($i+1)%16],$sigma0[2],$a1
|
||||
xor $t1,$t0,$t0
|
||||
_shr @X[($i+14)%16],$sigma1[2],$t1
|
||||
xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
|
||||
xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
|
||||
$LDM $SZ($Tbl),$t1
|
||||
addl $a0,@X[$i],@X[$i]
|
||||
addl $t0,@X[$i],@X[$i]
|
||||
___
|
||||
$code.=<<___ if ($i==15);
|
||||
extru $t1,31,10,$a1
|
||||
comiclr,<> $LAST10BITS,$a1,%r0
|
||||
ldo 1($Tbl),$Tbl ; signal end of $Tbl
|
||||
___
|
||||
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
|
||||
}
|
||||
|
||||
$code=<<___;
|
||||
.LEVEL $LEVEL
|
||||
.SPACE \$TEXT\$
|
||||
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
|
||||
|
||||
.ALIGN 64
|
||||
L\$table
|
||||
___
|
||||
$code.=<<___ if ($SZ==8);
|
||||
.WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
|
||||
.WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
|
||||
.WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
|
||||
.WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
|
||||
.WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
|
||||
.WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
|
||||
.WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
|
||||
.WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
|
||||
.WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
|
||||
.WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
|
||||
.WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
|
||||
.WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
|
||||
.WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
|
||||
.WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
|
||||
.WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
|
||||
.WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
|
||||
.WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
|
||||
.WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
|
||||
.WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
|
||||
.WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
|
||||
.WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
|
||||
.WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
|
||||
.WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
|
||||
.WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
|
||||
.WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
|
||||
.WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
|
||||
.WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
|
||||
.WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
|
||||
.WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
|
||||
.WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
|
||||
.WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
|
||||
.WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
|
||||
.WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
|
||||
.WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
|
||||
.WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
|
||||
.WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
|
||||
.WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
|
||||
.WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
|
||||
.WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
|
||||
.WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
|
||||
___
|
||||
$code.=<<___ if ($SZ==4);
|
||||
.WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
|
||||
.WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
|
||||
.WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
|
||||
.WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
|
||||
.WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
|
||||
.WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
|
||||
.WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
|
||||
.WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
|
||||
.WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
|
||||
.WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
|
||||
.WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
|
||||
.WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
|
||||
.WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
|
||||
.WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
|
||||
.WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
|
||||
.WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
|
||||
___
|
||||
$code.=<<___;
|
||||
|
||||
.EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
|
||||
.ALIGN 64
|
||||
$func
|
||||
.PROC
|
||||
.CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
|
||||
.ENTRY
|
||||
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
|
||||
$PUSHMA %r3,$FRAME(%sp)
|
||||
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
|
||||
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
|
||||
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
|
||||
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
|
||||
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
|
||||
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
|
||||
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
|
||||
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
|
||||
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
|
||||
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
|
||||
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
|
||||
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
|
||||
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
|
||||
$PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
|
||||
$PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
|
||||
|
||||
_shl $num,`log(16*$SZ)/log(2)`,$num
|
||||
addl $inp,$num,$num ; $num to point at the end of $inp
|
||||
|
||||
$PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
|
||||
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
|
||||
$PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
|
||||
|
||||
blr %r0,$Tbl
|
||||
ldi 3,$t1
|
||||
L\$pic
|
||||
andcm $Tbl,$t1,$Tbl ; wipe privilege level
|
||||
ldo L\$table-L\$pic($Tbl),$Tbl
|
||||
___
|
||||
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
|
||||
ldi 31,$t1
|
||||
mtctl $t1,%cr11
|
||||
extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
|
||||
b L\$parisc1
|
||||
nop
|
||||
___
|
||||
$code.=<<___;
|
||||
$LD `0*$SZ`($ctx),$A ; load context
|
||||
$LD `1*$SZ`($ctx),$B
|
||||
$LD `2*$SZ`($ctx),$C
|
||||
$LD `3*$SZ`($ctx),$D
|
||||
$LD `4*$SZ`($ctx),$E
|
||||
$LD `5*$SZ`($ctx),$F
|
||||
$LD `6*$SZ`($ctx),$G
|
||||
$LD `7*$SZ`($ctx),$H
|
||||
|
||||
extru $inp,31,`log($SZ)/log(2)`,$t0
|
||||
sh3addl $t0,%r0,$t0
|
||||
subi `8*$SZ`,$t0,$t0
|
||||
mtctl $t0,%cr11 ; load %sar with align factor
|
||||
|
||||
L\$oop
|
||||
ldi `$SZ-1`,$t0
|
||||
$LDM $SZ($Tbl),$t1
|
||||
andcm $inp,$t0,$t0 ; align $inp
|
||||
___
|
||||
for ($i=0;$i<15;$i++) { # load input block
|
||||
$code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
|
||||
$code.=<<___;
|
||||
cmpb,*= $inp,$t0,L\$aligned
|
||||
$LD `$SZ*15`($t0),@X[15]
|
||||
$LD `$SZ*16`($t0),@X[16]
|
||||
___
|
||||
for ($i=0;$i<16;$i++) { # align data
|
||||
$code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
|
||||
$code.=<<___;
|
||||
L\$aligned
|
||||
nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
|
||||
___
|
||||
|
||||
for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
L\$rounds
|
||||
nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
|
||||
___
|
||||
for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
|
||||
$code.=<<___;
|
||||
bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
|
||||
nop
|
||||
|
||||
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
|
||||
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
|
||||
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
|
||||
ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
|
||||
|
||||
$LD `0*$SZ`($ctx),@X[0] ; load context
|
||||
$LD `1*$SZ`($ctx),@X[1]
|
||||
$LD `2*$SZ`($ctx),@X[2]
|
||||
$LD `3*$SZ`($ctx),@X[3]
|
||||
$LD `4*$SZ`($ctx),@X[4]
|
||||
$LD `5*$SZ`($ctx),@X[5]
|
||||
addl @X[0],$A,$A
|
||||
$LD `6*$SZ`($ctx),@X[6]
|
||||
addl @X[1],$B,$B
|
||||
$LD `7*$SZ`($ctx),@X[7]
|
||||
ldo `16*$SZ`($inp),$inp ; advance $inp
|
||||
|
||||
$ST $A,`0*$SZ`($ctx) ; save context
|
||||
addl @X[2],$C,$C
|
||||
$ST $B,`1*$SZ`($ctx)
|
||||
addl @X[3],$D,$D
|
||||
$ST $C,`2*$SZ`($ctx)
|
||||
addl @X[4],$E,$E
|
||||
$ST $D,`3*$SZ`($ctx)
|
||||
addl @X[5],$F,$F
|
||||
$ST $E,`4*$SZ`($ctx)
|
||||
addl @X[6],$G,$G
|
||||
$ST $F,`5*$SZ`($ctx)
|
||||
addl @X[7],$H,$H
|
||||
$ST $G,`6*$SZ`($ctx)
|
||||
$ST $H,`7*$SZ`($ctx)
|
||||
|
||||
cmpb,*<>,n $inp,$num,L\$oop
|
||||
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
|
||||
___
|
||||
if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
|
||||
{{
|
||||
$code.=<<___;
|
||||
b L\$done
|
||||
nop
|
||||
|
||||
.ALIGN 64
|
||||
L\$parisc1
|
||||
___
|
||||
|
||||
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
|
||||
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
|
||||
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
|
||||
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
|
||||
$a0 ="%r17";
|
||||
$a1 ="%r18";
|
||||
$a2 ="%r19";
|
||||
$a3 ="%r20";
|
||||
$t0 ="%r21";
|
||||
$t1 ="%r22";
|
||||
$t2 ="%r28";
|
||||
$t3 ="%r29";
|
||||
$Tbl="%r31";
|
||||
|
||||
@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
|
||||
|
||||
# ROUND_00_15_pa1($i, $ahi,$alo, ..., $hhi,$hlo [, $flag])
#
# Appends the assembly for one SHA-512 round of the 32-bit code path to
# $code. Every 64-bit working variable a..h is passed as a pair of register
# names (high word, low word); 64-bit additions are emitted as add/addc on
# the low/high halves. The first four entries of the global @X name the
# registers holding the current schedule word X[i] and the next one X[i+1].
#
#   $i     round index, used for the X[i+1] stack offset and the $i==15 test
#   $flag  true when called from ROUND_16_xx_pa1, which has already emitted
#          the load of X[i+1] itself
#
# Side effects: appends to $code and rotates @X by two elements.
sub ROUND_00_15_pa1 {
|
||||
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
|
||||
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
|
||||
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
|
||||
|
||||
# Pre-load X[i+1] from its stack slot unless the caller already did so.
$code.=<<___ if (!$flag);
|
||||
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
|
||||
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
|
||||
___
|
||||
# Round body: h += X[i] + K[i] + Sigma1(e) + Ch(e,f,g); d += h;
# h += Sigma0(a) + Maj(a,b,c). K[i] is fetched through $Tbl, which the
# ldwm advances by 8 bytes per round.
$code.=<<___;
|
||||
shd $ehi,$elo,$Sigma1[0],$t0
|
||||
add $Xlo,$hlo,$hlo
|
||||
shd $elo,$ehi,$Sigma1[0],$t1
|
||||
addc $Xhi,$hhi,$hhi ; h += X[i]
|
||||
shd $ehi,$elo,$Sigma1[1],$t2
|
||||
ldwm 8($Tbl),$Xhi
|
||||
shd $elo,$ehi,$Sigma1[1],$t3
|
||||
ldw -4($Tbl),$Xlo ; load K[i]
|
||||
xor $t2,$t0,$t0
|
||||
xor $t3,$t1,$t1
|
||||
and $flo,$elo,$a0
|
||||
and $fhi,$ehi,$a1
|
||||
shd $ehi,$elo,$Sigma1[2],$t2
|
||||
andcm $glo,$elo,$a2
|
||||
shd $elo,$ehi,$Sigma1[2],$t3
|
||||
andcm $ghi,$ehi,$a3
|
||||
xor $t2,$t0,$t0
|
||||
xor $t3,$t1,$t1 ; Sigma1(e)
|
||||
add $Xlo,$hlo,$hlo
|
||||
xor $a2,$a0,$a0
|
||||
addc $Xhi,$hhi,$hhi ; h += K[i]
|
||||
xor $a3,$a1,$a1 ; Ch(e,f,g)
|
||||
|
||||
add $t0,$hlo,$hlo
|
||||
shd $ahi,$alo,$Sigma0[0],$t0
|
||||
addc $t1,$hhi,$hhi ; h += Sigma1(e)
|
||||
shd $alo,$ahi,$Sigma0[0],$t1
|
||||
add $a0,$hlo,$hlo
|
||||
shd $ahi,$alo,$Sigma0[1],$t2
|
||||
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
|
||||
shd $alo,$ahi,$Sigma0[1],$t3
|
||||
|
||||
xor $t2,$t0,$t0
|
||||
xor $t3,$t1,$t1
|
||||
shd $ahi,$alo,$Sigma0[2],$t2
|
||||
and $alo,$blo,$a0
|
||||
shd $alo,$ahi,$Sigma0[2],$t3
|
||||
and $ahi,$bhi,$a1
|
||||
xor $t2,$t0,$t0
|
||||
xor $t3,$t1,$t1 ; Sigma0(a)
|
||||
|
||||
and $alo,$clo,$a2
|
||||
and $ahi,$chi,$a3
|
||||
xor $a2,$a0,$a0
|
||||
add $hlo,$dlo,$dlo
|
||||
xor $a3,$a1,$a1
|
||||
addc $hhi,$dhi,$dhi ; d += h
|
||||
and $blo,$clo,$a2
|
||||
add $t0,$hlo,$hlo
|
||||
and $bhi,$chi,$a3
|
||||
addc $t1,$hhi,$hhi ; h += Sigma0(a)
|
||||
xor $a2,$a0,$a0
|
||||
add $a0,$hlo,$hlo
|
||||
xor $a3,$a1,$a1 ; Maj(a,b,c)
|
||||
addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
|
||||
|
||||
___
|
||||
# Loop test, emitted only for the last round of a ROUND_16_xx_pa1 pass:
# the low 10 bits of the K word just loaded are compared with $LAST10BITS
# and a branch back to L$rounds_pa1 follows.
# NOTE(review): presumably comiclr,= nullifies that branch once the final
# table entry has been consumed - confirm against the PA-RISC manual.
$code.=<<___ if ($i==15 && $flag);
|
||||
extru $Xlo,31,10,$Xlo
|
||||
comiclr,= $LAST10BITS,$Xlo,%r0
|
||||
b L\$rounds_pa1
|
||||
nop
|
||||
___
|
||||
# Rotate @X by two names so the "next" register pair becomes the current
# one for the following round.
push(@X,shift(@X)); push(@X,shift(@X));
|
||||
}
|
||||
|
||||
# ROUND_16_xx_pa1($i, $ahi,$alo, ..., $hhi,$hlo)
#
# Appends the assembly for one message-schedule update plus one round of
# the 32-bit code path to $code. $i is reduced by 16 and used modulo 16 to
# address the 16-entry X[] ring kept on the stack at -$XOFF(%sp).
#
# The emitted code loads X[i+1], X[i+9] and X[i+14], computes
#   X[i] += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14])
# as add/addc pairs on low/high words, stores X[i] back to its slot and
# then hands over to ROUND_00_15_pa1 with a true $flag (X[i+1] is already
# loaded, and the loop test is wanted on the last round).
#
# In each sigma the third term uses extru on the high word where the first
# two use shd; presumably that term is a plain shift rather than a
# rotation - confirm against the SHA-512 definition.
sub ROUND_16_xx_pa1 {
|
||||
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
|
||||
my ($i)=shift;
|
||||
$i-=16;
|
||||
$code.=<<___;
|
||||
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
|
||||
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
|
||||
ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
|
||||
ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
|
||||
ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
|
||||
ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
|
||||
shd $Xnhi,$Xnlo,$sigma0[0],$t0
|
||||
shd $Xnlo,$Xnhi,$sigma0[0],$t1
|
||||
add $a0,$Xlo,$Xlo
|
||||
shd $Xnhi,$Xnlo,$sigma0[1],$t2
|
||||
addc $a1,$Xhi,$Xhi
|
||||
shd $Xnlo,$Xnhi,$sigma0[1],$t3
|
||||
xor $t2,$t0,$t0
|
||||
shd $Xnhi,$Xnlo,$sigma0[2],$t2
|
||||
xor $t3,$t1,$t1
|
||||
extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
|
||||
xor $t2,$t0,$t0
|
||||
shd $a3,$a2,$sigma1[0],$a0
|
||||
xor $t3,$t1,$t1 ; sigma0(X[(i+1)&0x0f])
|
||||
shd $a2,$a3,$sigma1[0],$a1
|
||||
add $t0,$Xlo,$Xlo
|
||||
shd $a3,$a2,$sigma1[1],$t2
|
||||
addc $t1,$Xhi,$Xhi
|
||||
shd $a2,$a3,$sigma1[1],$t3
|
||||
xor $t2,$a0,$a0
|
||||
shd $a3,$a2,$sigma1[2],$t2
|
||||
xor $t3,$a1,$a1
|
||||
extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
|
||||
xor $t2,$a0,$a0
|
||||
xor $t3,$a1,$a1 ; sigma1(X[(i+14)&0x0f])
|
||||
add $a0,$Xlo,$Xlo
|
||||
addc $a1,$Xhi,$Xhi
|
||||
|
||||
stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
|
||||
stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
|
||||
___
|
||||
# Remaining arguments in @_ are the sixteen working-variable registers.
&ROUND_00_15_pa1($i,@_,1);
|
||||
}
|
||||
$code.=<<___;
|
||||
ldw `0*4`($ctx),$Ahi ; load context
|
||||
ldw `1*4`($ctx),$Alo
|
||||
ldw `2*4`($ctx),$Bhi
|
||||
ldw `3*4`($ctx),$Blo
|
||||
ldw `4*4`($ctx),$Chi
|
||||
ldw `5*4`($ctx),$Clo
|
||||
ldw `6*4`($ctx),$Dhi
|
||||
ldw `7*4`($ctx),$Dlo
|
||||
ldw `8*4`($ctx),$Ehi
|
||||
ldw `9*4`($ctx),$Elo
|
||||
ldw `10*4`($ctx),$Fhi
|
||||
ldw `11*4`($ctx),$Flo
|
||||
ldw `12*4`($ctx),$Ghi
|
||||
ldw `13*4`($ctx),$Glo
|
||||
ldw `14*4`($ctx),$Hhi
|
||||
ldw `15*4`($ctx),$Hlo
|
||||
|
||||
extru $inp,31,2,$t0
|
||||
sh3addl $t0,%r0,$t0
|
||||
subi 32,$t0,$t0
|
||||
mtctl $t0,%cr11 ; load %sar with align factor
|
||||
|
||||
L\$oop_pa1
|
||||
extru $inp,31,2,$a3
|
||||
comib,= 0,$a3,L\$aligned_pa1
|
||||
sub $inp,$a3,$inp
|
||||
|
||||
ldw `0*4`($inp),$X[0]
|
||||
ldw `1*4`($inp),$X[1]
|
||||
ldw `2*4`($inp),$t2
|
||||
ldw `3*4`($inp),$t3
|
||||
ldw `4*4`($inp),$a0
|
||||
ldw `5*4`($inp),$a1
|
||||
ldw `6*4`($inp),$a2
|
||||
ldw `7*4`($inp),$a3
|
||||
vshd $X[0],$X[1],$X[0]
|
||||
vshd $X[1],$t2,$X[1]
|
||||
stw $X[0],`-$XOFF+0*4`(%sp)
|
||||
ldw `8*4`($inp),$t0
|
||||
vshd $t2,$t3,$t2
|
||||
stw $X[1],`-$XOFF+1*4`(%sp)
|
||||
ldw `9*4`($inp),$t1
|
||||
vshd $t3,$a0,$t3
|
||||
___
|
||||
{
|
||||
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
|
||||
for ($i=2;$i<=(128/4-8);$i++) {
|
||||
$code.=<<___;
|
||||
stw $t[0],`-$XOFF+$i*4`(%sp)
|
||||
ldw `(8+$i)*4`($inp),$t[0]
|
||||
vshd $t[1],$t[2],$t[1]
|
||||
___
|
||||
push(@t,shift(@t));
|
||||
}
|
||||
for (;$i<(128/4-1);$i++) {
|
||||
$code.=<<___;
|
||||
stw $t[0],`-$XOFF+$i*4`(%sp)
|
||||
vshd $t[1],$t[2],$t[1]
|
||||
___
|
||||
push(@t,shift(@t));
|
||||
}
|
||||
$code.=<<___;
|
||||
b L\$collected_pa1
|
||||
stw $t[0],`-$XOFF+$i*4`(%sp)
|
||||
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
L\$aligned_pa1
|
||||
ldw `0*4`($inp),$X[0]
|
||||
ldw `1*4`($inp),$X[1]
|
||||
ldw `2*4`($inp),$t2
|
||||
ldw `3*4`($inp),$t3
|
||||
ldw `4*4`($inp),$a0
|
||||
ldw `5*4`($inp),$a1
|
||||
ldw `6*4`($inp),$a2
|
||||
ldw `7*4`($inp),$a3
|
||||
stw $X[0],`-$XOFF+0*4`(%sp)
|
||||
ldw `8*4`($inp),$t0
|
||||
stw $X[1],`-$XOFF+1*4`(%sp)
|
||||
ldw `9*4`($inp),$t1
|
||||
___
|
||||
{
|
||||
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
|
||||
for ($i=2;$i<(128/4-8);$i++) {
|
||||
$code.=<<___;
|
||||
stw $t[0],`-$XOFF+$i*4`(%sp)
|
||||
ldw `(8+$i)*4`($inp),$t[0]
|
||||
___
|
||||
push(@t,shift(@t));
|
||||
}
|
||||
for (;$i<128/4;$i++) {
|
||||
$code.=<<___;
|
||||
stw $t[0],`-$XOFF+$i*4`(%sp)
|
||||
___
|
||||
push(@t,shift(@t));
|
||||
}
|
||||
$code.="L\$collected_pa1\n";
|
||||
}
|
||||
|
||||
for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
|
||||
$code.="L\$rounds_pa1\n";
|
||||
for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
|
||||
|
||||
$code.=<<___;
|
||||
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
|
||||
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
|
||||
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
|
||||
ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
|
||||
|
||||
ldw `0*4`($ctx),$t1 ; update context
|
||||
ldw `1*4`($ctx),$t0
|
||||
ldw `2*4`($ctx),$t3
|
||||
ldw `3*4`($ctx),$t2
|
||||
ldw `4*4`($ctx),$a1
|
||||
ldw `5*4`($ctx),$a0
|
||||
ldw `6*4`($ctx),$a3
|
||||
add $t0,$Alo,$Alo
|
||||
ldw `7*4`($ctx),$a2
|
||||
addc $t1,$Ahi,$Ahi
|
||||
ldw `8*4`($ctx),$t1
|
||||
add $t2,$Blo,$Blo
|
||||
ldw `9*4`($ctx),$t0
|
||||
addc $t3,$Bhi,$Bhi
|
||||
ldw `10*4`($ctx),$t3
|
||||
add $a0,$Clo,$Clo
|
||||
ldw `11*4`($ctx),$t2
|
||||
addc $a1,$Chi,$Chi
|
||||
ldw `12*4`($ctx),$a1
|
||||
add $a2,$Dlo,$Dlo
|
||||
ldw `13*4`($ctx),$a0
|
||||
addc $a3,$Dhi,$Dhi
|
||||
ldw `14*4`($ctx),$a3
|
||||
add $t0,$Elo,$Elo
|
||||
ldw `15*4`($ctx),$a2
|
||||
addc $t1,$Ehi,$Ehi
|
||||
stw $Ahi,`0*4`($ctx)
|
||||
add $t2,$Flo,$Flo
|
||||
stw $Alo,`1*4`($ctx)
|
||||
addc $t3,$Fhi,$Fhi
|
||||
stw $Bhi,`2*4`($ctx)
|
||||
add $a0,$Glo,$Glo
|
||||
stw $Blo,`3*4`($ctx)
|
||||
addc $a1,$Ghi,$Ghi
|
||||
stw $Chi,`4*4`($ctx)
|
||||
add $a2,$Hlo,$Hlo
|
||||
stw $Clo,`5*4`($ctx)
|
||||
addc $a3,$Hhi,$Hhi
|
||||
stw $Dhi,`6*4`($ctx)
|
||||
ldo `16*$SZ`($inp),$inp ; advance $inp
|
||||
stw $Dlo,`7*4`($ctx)
|
||||
stw $Ehi,`8*4`($ctx)
|
||||
stw $Elo,`9*4`($ctx)
|
||||
stw $Fhi,`10*4`($ctx)
|
||||
stw $Flo,`11*4`($ctx)
|
||||
stw $Ghi,`12*4`($ctx)
|
||||
stw $Glo,`13*4`($ctx)
|
||||
stw $Hhi,`14*4`($ctx)
|
||||
comb,= $inp,$num,L\$done
|
||||
stw $Hlo,`15*4`($ctx)
|
||||
b L\$oop_pa1
|
||||
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
|
||||
L\$done
|
||||
___
|
||||
}}
|
||||
$code.=<<___;
|
||||
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
|
||||
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
|
||||
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
|
||||
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
|
||||
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
|
||||
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
|
||||
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
|
||||
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
|
||||
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
|
||||
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
|
||||
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
|
||||
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
|
||||
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
|
||||
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
|
||||
$POP `-$FRAME+14*$SIZE_T`(%sp),%r17
|
||||
$POP `-$FRAME+15*$SIZE_T`(%sp),%r18
|
||||
bv (%r2)
|
||||
.EXIT
|
||||
$POPMB -$FRAME(%sp),%r3
|
||||
.PROCEND
|
||||
.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
___
|
||||
|
||||
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
|
||||
# that it can be compiled with .LEVEL 1.0. It should be noted that I
|
||||
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
|
||||
# directive...
|
||||
|
||||
# Hand-encode an "ldd" instruction as a .WORD directive.
#
#   $mod   completer text following the mnemonic (e.g. "", ",mb")
#   $args  operand text; only the "disp(%rB),%rT" shape is encoded
#
# Returns the replacement source line. Operands of any other shape are
# handed back as a plain "ldd" line for the assembler to deal with.
my $ldd = sub {
    my ($mod,$args) = @_;
    my $orig = "ldd$mod\t$args";

    my ($disp,$base,$target) =
        $args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/    # format 3 suffices
        or return "\t".$orig;

    my $opcode = (0x14<<26) | ($base<<21) | ($target<<16);
    # Displacement: bits selected by 0x1FF8 move up by one, bit 13 of the
    # value (set for negative displacements) lands in bit 0.
    $opcode |= ($disp & 0x1FF8) << 1;
    $opcode |= ($disp >> 13) & 1;
    # Any ",m..." completer sets bit 3; ",mb" sets bit 2 as well.
    $opcode |= (1<<3) if ($mod =~ /^,m/);
    $opcode |= (1<<2) if ($mod =~ /^,mb/);

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
|
||||
|
||||
# Hand-encode a "std" instruction as a .WORD directive.
#
#   $mod   completer text following the mnemonic (kept in the trailing
#          comment only; it does not influence the encoding here)
#   $args  operand text; only the "%rS,disp(%rB)" shape is encoded
#
# Returns the replacement source line, or a plain "std" line when the
# operands have some other shape.
my $std = sub {
    my ($mod,$args) = @_;
    my $orig = "std$mod\t$args";

    my ($source,$disp,$base) =
        $args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/    # format 3 suffices
        or return "\t".$orig;

    my $opcode = (0x1c<<26) | ($base<<21) | ($source<<16);
    # Same displacement layout as used for ldd above the 0x1c opcode.
    $opcode |= ($disp & 0x1FF8) << 1;
    $opcode |= ($disp >> 13) & 1;

    return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
};
|
||||
|
||||
# Hand-encode an "extrd" instruction as a .WORD directive.
#
#   $mod   completer text; only ",u" is in use and it is implied by the
#          encoding. A completer ending in "=" (optionally "*=") sets
#          bit 13 in the %sar form.
#   $args  operand text, either "%rS,pos,len,%rT" (fixed position) or
#          "%rS,%sar,len,%rT" (position taken from %sar)
#
# Returns the replacement source line, or a plain "extrd" line when the
# operands match neither shape.
my $extrd = sub {
    my ($mod,$args) = @_;
    my $orig = "extrd$mod\t$args";

    # format 15: immediate position
    if (my ($source,$pos,$width,$target) =
            $args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
        my $len = 32-$width;
        my $opcode = (0x36<<26) | ($source<<21) | ($target<<16);
        $opcode |= (($pos&0x20)<<6) | (($pos&0x1f)<<5);    # encode pos
        $opcode |= (($len&0x20)<<7) | ($len&0x1f);         # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    # format 12: position in %sar
    if (my ($source,$width,$target) =
            $args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
        my $len = 32-$width;
        my $opcode = (0x34<<26) | ($source<<21) | ($target<<16) | (2<<11) | (1<<9);
        $opcode |= (($len&0x20)<<3) | ($len&0x1f);         # encode len
        $opcode |= (1<<13) if ($mod =~ /,\**=/);
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};
|
||||
|
||||
# Hand-encode a "shrpd" instruction as a .WORD directive.
#
#   $mod   completer text (kept in the trailing comment only)
#   $args  operand text, either "%rA,%rB,sa,%rT" (constant shift amount)
#          or "%rA,%rB,%sar,%rT" (shift amount taken from %sar)
#
# Returns the replacement source line, or a plain "shrpd" line when the
# operands match neither shape.
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $orig = "shrpd$mod\t$args";

    # format 14: constant shift amount, stored as 63-sa
    if (my ($first,$second,$amount,$target) =
            $args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {
        my $cpos = 63-$amount;
        my $opcode = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
        $opcode |= (($cpos&0x20)<<6) | (($cpos&0x1f)<<5);  # encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    # format 11: shift amount in %sar
    if (my ($first,$second,$target) =
            $args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {
        my $opcode = (0x34<<26)|($second<<21)|($first<<16)|(1<<9)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }

    return "\t".$orig;
};
|
||||
|
||||
# assemble($mnemonic, $mod, $args)
#
# Returns the source line for one instruction. Mnemonics that have a
# hand-written encoder (ldd, std, extrd, shrpd) are passed to it together
# with the completer text $mod and the operand text $args; every other
# mnemonic is re-emitted verbatim as "\t$mnemonic$mod\t$args".
#
# The encoder is looked up in an explicit table instead of string-eval'ing
# "\$$mnemonic", so an instruction name can never resolve to an unrelated
# variable that happens to share its name. A new encoder closure has to be
# registered here to take effect.
sub assemble {
    my ($mnemonic,$mod,$args)=@_;

    my %encoder_for = (
        ldd   => $ldd,
        std   => $std,
        extrd => $extrd,
        shrpd => $shrpd,
    );
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE'
        ? $encoder->($mod,$args)
        : "\t$mnemonic$mod\t$args";
}
|
||||
|
||||
# Post-process the generated text line by line and print the result.
# The order of the steps matters: backtick expressions are evaluated
# first, then at most one of the chained "or" rewrites is applied, and
# only then are instructions handed to assemble().
foreach (split("\n",$code)) {
|
||||
# replace every `...` with the value of the Perl expression inside it
s/\`([^\`]*)\`/eval $1/ge;
|
||||
|
||||
# shd with a constant shift above 31: swap the two source registers and
# reduce the shift by 32
s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
|
||||
$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
|
||||
: sprintf("shd\t%$1,%$2,%d",$3)/e or
|
||||
# translate made-up instructions: _ror, _shr, _align, _shl
# (_ror, _shr and _align select on $SZ, _shl selects on $SIZE_T)
s/_ror(\s+)(%r[0-9]+),/
|
||||
($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
|
||||
|
||||
s/_shr(\s+%r[0-9]+),([0-9]+),/
|
||||
$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
|
||||
: sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
|
||||
|
||||
s/_align(\s+%r[0-9]+,%r[0-9]+),/
|
||||
($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
|
||||
|
||||
s/_shl(\s+%r[0-9]+),([0-9]+),/
|
||||
$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
|
||||
: sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
|
||||
|
||||
# with $SIZE_T==4, lines that start with whitespace followed by a
# lower-case mnemonic go through assemble()
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
|
||||
|
||||
# with $SIZE_T==4, "cmpb,*" is rewritten to "comb,"
s/cmpb,\*/comb,/ if ($SIZE_T==4);
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
||||
# Output is buffered, so a failed write (full disk, broken pipe) may only
# surface here; die instead of silently leaving a truncated assembly file.
close STDOUT or die "error closing STDOUT: $!";
|
Loading…
x
Reference in New Issue
Block a user