PA-RISC assembler pack update from HEAD.

This commit is contained in:
Andy Polyakov 2011-11-14 20:50:15 +00:00
parent 5d9bb428bb
commit 1a111921da
6 changed files with 3601 additions and 0 deletions

1021
crypto/aes/asm/aes-parisc.pl Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,993 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path, provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
# Directory this script was started from (with trailing separator); it is
# used below to locate opensslconf.h.  Left untouched when $0 carries no
# directory part.
$dir = $1 if ($0 =~ m/(.*[\/\\])[^\/\\]+$/);

# Command line: <flavour> <output file>.
$flavour = shift;
$output = shift;

# Redirect STDOUT to the requested file and fail loudly if that is not
# possible; a silently failed open would discard all generated code.
# Without an output argument the existing STDOUT is used.
if (defined $output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

if ($flavour =~ /64/) {
	# 64-bit flavour: doubleword pushes/pops, 64-bit BN words.
	$LEVEL ="2.0W";
	$SIZE_T =8;
	$FRAME_MARKER =80;
	$SAVED_RP =16;
	$PUSH ="std";
	$PUSHMA ="std,ma";
	$POP ="ldd";
	$POPMB ="ldd,mb";
	$BN_SZ =$SIZE_T;
} else {
	# 32-bit flavour: word pushes/pops.
	$LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T =4;
	$FRAME_MARKER =48;
	$SAVED_RP =20;
	$PUSH ="stw";
	$PUSHMA ="stwm";
	$POP ="ldw";
	$POPMB ="ldwm";
	$BN_SZ =$SIZE_T;
	# A 32-bit build may still be configured with 64-bit BN words; probe
	# opensslconf.h for that.  A missing header is tolerated on purpose
	# and leaves the 32-bit defaults in place.
	if (open my $conf,'<',"${dir}../../opensslconf.h") {
		while (my $line = <$conf>) {
			if ($line =~ m/#\s*define\s+SIXTY_FOUR_BIT/) {
				$BN_SZ=8;
				$LEVEL="2.0";
				last;
			}
		}
		close $conf;
	}
}
# Stack frame layout: 8 saved registers, the frame marker [+ argument
# transfer] and 32 bytes of local variables.  $LOCALS is the offset of
# the local area, i.e. the size of the register save area.
$LOCALS = 8*$SIZE_T;
$FRAME  = $LOCALS + $FRAME_MARKER + 32;

# General register allocation.
($tp,  $ti1, $ti0)      = ("%r31", "%r29", "%r28");
($rp,  $ap,  $bp,  $np) = ("%r26", "%r25", "%r24", "%r23");
($n0,  $num)            = ("%r22", "%r21");	# passed through stack in 32-bit
($idx, $arrsz)          = ("%r20", "%r19");
($nm1, $nm0)            = ("%r7",  "%r6");
($ab1, $ab0)            = ("%r5",  "%r4");
$fp                     = "%r3";
($hi1, $hi0)            = ("%r2",  "%r1");

# The transfer-area pointer reuses n0's register; it accommodates the
# [-16..15] offset range of fld[dw]s.
$xfer = $n0;

# Floating-point register allocation ($fti shares a register with $fm0).
$fm0 = "%fr4";
$fti = $fm0;
($fbi, $fn0)         = ("%fr5L", "%fr5R");
($fai, $fab0, $fab1) = ("%fr6",  "%fr7",  "%fr8");
($fni, $fnm0, $fnm1) = ("%fr9",  "%fr10", "%fr11");
# Assembler preamble and bn_mul_mont prologue: export the symbol, save %r2
# at -$SAVED_RP(%sp), allocate $FRAME bytes while saving %r3, store
# %r4-%r10 into the new frame and point $fp at the frame base.
# Back-ticked expressions are left in the text here and evaluated by the
# output loop at the end of the script.
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
.ALIGN 64
bn_mul_mont
.PROC
.CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
ldo -$FRAME(%sp),$fp
___
# 32-bit ABI only: n0 and num are passed through the stack, so load them
# from below the caller's frame marker into their registers.
$code.=<<___ if ($SIZE_T==4);
ldw `-$FRAME_MARKER-4`($fp),$n0
ldw `-$FRAME_MARKER-8`($fp),$num
nop
nop ; alignment
___
# 32-bit BN words: branch to L$abort with %r28=0 ("unhandled") when the
# vectors are too short, $num is odd, or ap/np are not 64-bit aligned;
# otherwise load n0 and bp[0] (post-incrementing $bp).
$code.=<<___ if ($BN_SZ==4);
comiclr,<= 6,$num,%r0 ; are vectors long enough?
b L\$abort
ldi 0,%r28 ; signal "unhandled"
add,ev %r0,$num,$num ; is $num even?
b L\$abort
nop
or $ap,$np,$ti1
extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned?
b L\$abort
nop
nop ; alignment
nop
fldws 0($n0),${fn0}
fldws,ma 4($bp),${fbi} ; bp[0]
___
# 64-bit BN words: only the minimum length is checked; $num is doubled
# because the core works on 32-bit halves, and the halves of n0 and bp[0]
# at offset 4 are loaded.
$code.=<<___ if ($BN_SZ==8);
comib,> 3,$num,L\$abort ; are vectors long enough?
ldi 0,%r28 ; signal "unhandled"
addl $num,$num,$num ; I operate on 32-bit values
fldws 4($n0),${fn0} ; only low part of n0
fldws 4($bp),${fbi} ; bp[0] in flipped word order
___
# Set-up shared by both code paths: load the first ap/np pairs, compute
# $arrsz = 4*$num, grow the stack by a 32-byte-rounded tp[num+1] area,
# remember $fp just below the new %sp, point $xfer/$tp into the local
# area, start the first products and make $ap/$np point at the end of
# their vectors so that $idx can count up from -$arrsz to zero.
$code.=<<___;
fldds 0($ap),${fai} ; ap[0,1]
fldds 0($np),${fni} ; np[0,1]
sh2addl $num,%r0,$arrsz
ldi 31,$hi0
ldo 36($arrsz),$hi1 ; space for tp[num+1]
andcm $hi1,$hi0,$hi1 ; align
addl $hi1,%sp,%sp
$PUSH $fp,-$SIZE_T(%sp)
ldo `$LOCALS+16`($fp),$xfer
ldo `$LOCALS+32+4`($fp),$tp
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0]
xmpyu ${fn0},${fab0}R,${fm0}
addl $arrsz,$ap,$ap ; point at the end
addl $arrsz,$np,$np
subi 0,$arrsz,$idx ; j=0
ldo 8($idx),$idx ; j++++
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
fstds ${fab1},0($xfer)
fstds ${fnm1},8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
___
# 32-bit BN words only: run-time selection between the two code paths.
# NOTE(review): presumably the hand-encoded extrd,u,*= nullifies the
# following branch on a PA-RISC 2.0 CPU, so only older CPUs reach
# L$parisc11 -- confirm against the architecture manual.
$code.=<<___ if ($BN_SZ==4);
mtctl $hi0,%cr11 ; $hi0 still holds 31
extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0
b L\$parisc11
nop
___
# First pass of the PA-RISC 2.0 path (multiplier word bp[0]): products
# ap[j]*bp[0] and np[j]*m travel from the FPU to integer registers through
# the $xfer area, are accumulated with 64-bit additions, and the 32-bit
# results are stored to tp[].  L$1st handles two words per iteration; the
# code after the loop drains the remaining products and resets $idx.
$code.=<<___; # PA-RISC 2.0 code-path
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $ab0,63,32,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
addl $ab0,$nm0,$nm0 ; low part is discarded
extrd,u $nm0,31,32,$hi1
L\$1st
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
stw $nm1,-4($tp) ; tp[j-1]
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,63,32,$ab1
addl $hi1,$nm1,$nm1
ldd -16($xfer),$ab0
addl $ab1,$nm1,$nm1
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
extrd,u $ab0,31,32,$hi0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
___
# Fetch the next multiplier word bp[1]: 32-bit BN words simply advance
# $bp, 64-bit BN words read the other half of the first doubleword.
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
fldws 0($bp),${fbi} ; bp[1] in flipped word order
___
# PA-RISC 2.0 path, remaining passes.  The code before L$outer finishes
# the previous pass (top words of tp[]) while starting ap[0..1]*bp[1];
# tp[0] is added to the low product in the FPU (integer -> double -> add
# -> integer) before it is multiplied by n0 to obtain m.  L$outer/L$inner
# then accumulate ap[j]*bp[i] + np[j]*m + tp[j], two words per inner
# iteration; the trailing addib leaves through L$outerdone once $num
# reaches zero.
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
addl $hi0,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nm1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
addl $hi1,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2]
flddx $idx($np),${fni} ; np[2]
ldo 8($idx),$idx ; j++++
ldd -16($xfer),$ab0 ; 33-bit value
ldd -8($xfer),$nm0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
extrd,u $ab0,31,32,$ti0 ; carry bit
extrd,u $ab0,63,32,$ab0
fstds ${fab1},0($xfer)
addl $ti0,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
addl $ab0,$nm0,$nm0 ; low part is discarded
ldw 0($tp),$ti1 ; tp[1]
extrd,u $nm0,31,32,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
L\$inner
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
flddx $idx($ap),${fai} ; ap[j,j+1]
flddx $idx($np),${fni} ; np[j,j+1]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldw 4($tp),$ti0 ; tp[j]
stw $nm1,-4($tp) ; tp[j-1]
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldd -16($xfer),$ab0
fstds ${fab0},-16($xfer)
addl $hi0,$ti0,$ti0
addl $ti0,$ab0,$ab0
ldd -8($xfer),$nm0
fstds ${fnm0},-8($xfer)
extrd,u $ab0,31,32,$hi0
extrd,u $nm1,31,32,$hi1
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
addl $ab0,$nm0,$nm0
stw,ma $nm0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner ; j++++
extrd,u $nm0,31,32,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldd 0($xfer),$ab1
fstds ${fab1},0($xfer)
addl $hi0,$ti1,$ti1
addl $ti1,$ab1,$ab1
ldd 8($xfer),$nm1
fstds ${fnm1},8($xfer)
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
ldd -16($xfer),$ab0
ldd -8($xfer),$nm0
extrd,u $nm1,31,32,$hi1
addl $hi0,$ab0,$ab0
addl $ti0,$ab0,$ab0
stw $nm1,-4($tp) ; tp[j-1]
extrd,u $ab0,31,32,$hi0
ldw 8($tp),$ti1 ; tp[j]
extrd,u $ab0,63,32,$ab0
addl $hi1,$nm0,$nm0
ldd 0($xfer),$ab1
addl $ab0,$nm0,$nm0
ldd,mb 8($xfer),$nm1
extrd,u $nm0,31,32,$hi1
stw,ma $nm0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone ; i--
subi 0,$arrsz,$idx ; j=0
___
# Fetch the next multiplier word bp[i].  With 64-bit BN words the two
# halves of each doubleword are visited in flipped order, so $bp moves by
# +12 or -4 depending on the parity of the remaining count in $num.
$code.=<<___ if ($BN_SZ==4);
fldws,ma 4($bp),${fbi} ; bp[i]
___
$code.=<<___ if ($BN_SZ==8);
ldi 12,$ti0 ; bp[i] in flipped word order
addl,ev %r0,$num,$num
ldi -4,$ti0
addl $ti0,$bp,$bp
fldws 0($bp),${fbi}
___
# PA-RISC 2.0 path: bottom of the outer loop and its exit.  The first part
# stores the top words of tp[] for the pass just finished while starting
# ap[0..1]*bp[i] for the next one, then branches back to L$outer with $tp
# rewound.  L$outerdone performs the same final accumulation without
# starting new products, rewinds $tp to the start of tp[] and clears the
# borrow for the subtraction that follows.
$code.=<<___;
flddx $idx($ap),${fai} ; ap[0]
addl $hi0,$ab1,$ab1
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
fstws,mb ${fab0}L,-8($xfer) ; save high part
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nm1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
addl $hi1,$hi0,$hi0
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone
addl $hi0,$ab1,$ab1
addl $ti1,$ab1,$ab1
extrd,u $ab1,31,32,$hi0
extrd,u $ab1,63,32,$ab1
ldw 4($tp),$ti0 ; tp[j]
addl $hi1,$nm1,$nm1
addl $ab1,$nm1,$nm1
extrd,u $nm1,31,32,$hi1
stw $nm1,-4($tp) ; tp[j-1]
addl $hi1,$hi0,$hi0
addl $ti0,$hi0,$hi0
extrd,u $hi0,31,32,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
___
# Final subtraction rp[] = tp[] - np[] with borrow propagation.  With
# 32-bit BN words the loop works word by word, and an rp that is not
# 64-bit aligned is diverted to L$sub_pa11 in the PA-RISC 1.1 section.
$code.=<<___ if ($BN_SZ==4);
ldws,ma 4($tp),$ti0
extru,= $rp,31,3,%r0 ; is rp 64-bit aligned?
b L\$sub_pa11
addl $tp,$arrsz,$tp
L\$sub
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
___
# With 64-bit BN words the loop works on doublewords; each tp[] doubleword
# is flipped back into natural word order (and stored back) before the
# subtraction.
$code.=<<___ if ($BN_SZ==8);
ldd,ma 8($tp),$ti0
L\$sub
ldd $idx($np),$hi0
shrpd $ti0,$ti0,32,$ti0 ; flip word order
std $ti0,-8($tp) ; save flipped value
sub,db $ti0,$hi0,$hi1
ldd,ma 8($tp),$ti0
addib,<> 8,$idx,L\$sub
std,ma $hi1,8($rp)
extrd,u $ti0,31,32,$ti0 ; carry in flipped word order
sub,db $ti0,%r0,$hi1
ldo -8($tp),$tp
___
# Select the result without branching: $hi1 acts as a mask that picks
# either tp[] or rp[] as the copy source, which is then copied to rp[]
# while tp[] is overwritten with zeros.
$code.=<<___;
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy
ldd $idx($np),$hi0
std,ma %r0,8($tp)
addib,<> 8,$idx,.-8 ; L\$copy
std,ma $hi0,8($rp)
___
# PA-RISC 1.1 code path, emitted only for 32-bit BN words.  It has the
# same structure as the 2.0 path above (first pass, outer/inner loops,
# final subtraction, copy) but uses 32-bit add/addc pairs and word loads,
# so every 64-bit product is handled as a lo/hi register pair.
if ($BN_SZ==4) { # PA-RISC 1.1 code-path
# lo/hi aliases for the registers the 2.0 path uses as 64-bit
# accumulators, plus a second pair for the np[j+1]*m product
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";
# The text starts with the jump that lets the 2.0 path skip this section
# and ends at L$done, where both paths join.
$code.=<<___;
b L\$done
nop
.ALIGN 8
L\$parisc11
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -12($xfer),$ablo
ldw -16($xfer),$hi0
ldw -4($xfer),$nmlo0
ldw -8($xfer),$nmhi0
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldo 8($idx),$idx ; j++++
add $ablo,$nmlo0,$nmlo0 ; discarded
addc %r0,$nmhi0,$hi1
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
nop
L\$1st_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 12($xfer),$nmlo1
addc %r0,$abhi,$hi0
ldw 8($xfer),$nmhi1
add $ablo,$nmlo1,$nmlo1
fstds ${fab1},0($xfer)
addc %r0,$nmhi1,$nmhi1
fstds ${fnm1},8($xfer)
add $hi1,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$hi1
ldw -16($xfer),$abhi
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0]
ldw -4($xfer),$nmlo0
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -8($xfer),$nmhi0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
fstds ${fab0},-16($xfer)
add $ablo,$nmlo0,$nmlo0
fstds ${fnm0},-8($xfer)
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$1st_pa11 ; j++++
addc %r0,$nmhi0,$hi1
ldw 8($xfer),$nmhi1
ldw 12($xfer),$nmlo1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
add $hi0,$ablo,$ablo
fstds ${fab1},0($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm1},8($xfer)
add $ablo,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -8($xfer),$nmhi0
addc %r0,$nmhi1,$hi1
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
ldo -1($num),$num ; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[1]
flddx $idx($ap),${fai} ; ap[0,1]
flddx $idx($np),${fni} ; np[0,1]
fldws 8($xfer),${fti}R ; tp[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
stw $nmlo1,-4($tp) ; tp[j-1]
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
stw $hi0,0($tp)
stw $hi1,4($tp)
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
xmpyu ${fn0},${fab0}R,${fm0}
ldo `$LOCALS+32+4`($fp),$tp
L\$outer_pa11
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m
fstds ${fab0},-16($xfer) ; 33-bit value
fstds ${fnm0},-8($xfer)
flddx $idx($ap),${fai} ; ap[2,3]
flddx $idx($np),${fni} ; np[2,3]
ldw -16($xfer),$abhi ; carry bit actually
ldo 8($idx),$idx ; j++++
ldw -12($xfer),$ablo
ldw -8($xfer),$nmhi0
ldw -4($xfer),$nmlo0
ldw 0($xfer),$hi0 ; high part
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
fstds ${fab1},0($xfer)
addl $abhi,$hi0,$hi0 ; account carry bit
fstds ${fnm1},8($xfer)
add $ablo,$nmlo0,$nmlo0 ; discarded
ldw 0($tp),$ti1 ; tp[1]
addc %r0,$nmhi0,$hi1
fstds ${fab0},-16($xfer)
fstds ${fnm0},-8($xfer)
ldw 4($xfer),$ablo
ldw 0($xfer),$abhi
L\$inner_pa11
xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i]
flddx $idx($ap),${fai} ; ap[j,j+1]
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m
flddx $idx($np),${fni} ; np[j,j+1]
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
ldw 12($xfer),$nmlo1
add $ti1,$ablo,$ablo
ldw 8($xfer),$nmhi1
addc %r0,$abhi,$hi0
fstds ${fab1},0($xfer)
add $ablo,$nmlo1,$nmlo1
fstds ${fnm1},8($xfer)
addc %r0,$nmhi1,$nmhi1
ldw -12($xfer),$ablo
add $hi1,$nmlo1,$nmlo1
ldw -16($xfer),$abhi
addc %r0,$nmhi1,$hi1
xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i]
ldw 8($tp),$ti1 ; tp[j]
xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m
ldw -4($xfer),$nmlo0
add $hi0,$ablo,$ablo
ldw -8($xfer),$nmhi0
addc %r0,$abhi,$abhi
stw $nmlo1,-4($tp) ; tp[j-1]
add $ti0,$ablo,$ablo
fstds ${fab0},-16($xfer)
addc %r0,$abhi,$hi0
fstds ${fnm0},-8($xfer)
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldw 0($xfer),$abhi
add $hi1,$nmlo0,$nmlo0
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,<> 8,$idx,L\$inner_pa11 ; j++++
addc %r0,$nmhi0,$hi1
xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i]
ldw 12($xfer),$nmlo1
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m
ldw 8($xfer),$nmhi1
add $hi0,$ablo,$ablo
ldw 4($tp),$ti0 ; tp[j]
addc %r0,$abhi,$abhi
fstds ${fab1},0($xfer)
add $ti1,$ablo,$ablo
fstds ${fnm1},8($xfer)
addc %r0,$abhi,$hi0
ldw -16($xfer),$abhi
add $ablo,$nmlo1,$nmlo1
ldw -12($xfer),$ablo
addc %r0,$nmhi1,$nmhi1
ldw -8($xfer),$nmhi0
add $hi1,$nmlo1,$nmlo1
ldw -4($xfer),$nmlo0
addc %r0,$nmhi1,$hi1
add $hi0,$ablo,$ablo
stw $nmlo1,-4($tp) ; tp[j-1]
addc %r0,$abhi,$abhi
add $ti0,$ablo,$ablo
ldw 8($tp),$ti1 ; tp[j]
addc %r0,$abhi,$hi0
ldw 0($xfer),$abhi
add $ablo,$nmlo0,$nmlo0
ldw 4($xfer),$ablo
addc %r0,$nmhi0,$nmhi0
ldws,mb 8($xfer),$nmhi1
add $hi1,$nmlo0,$nmlo0
ldw 4($xfer),$nmlo1
addc %r0,$nmhi0,$hi1
stws,ma $nmlo0,8($tp) ; tp[j-1]
addib,= -1,$num,L\$outerdone_pa11; i--
subi 0,$arrsz,$idx ; j=0
fldws,ma 4($bp),${fbi} ; bp[i]
flddx $idx($ap),${fai} ; ap[0]
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
flddx $idx($np),${fni} ; np[0]
fldws 8($xfer),${fti}R ; tp[0]
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldo 8($idx),$idx ; j++++
xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i]
xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i]
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
fstws,mb ${fab0}L,-8($xfer) ; save high part
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
fcpy,sgl %fr0,${fti}L ; zero high part
fcpy,sgl %fr0,${fab0}L
stw $nmlo1,-4($tp) ; tp[j-1]
fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double
fcnvxf,dbl,dbl ${fab0},${fab0}
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
fadd,dbl ${fti},${fab0},${fab0} ; add tp[0]
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int
stw $hi0,0($tp)
stw $hi1,4($tp)
xmpyu ${fn0},${fab0}R,${fm0}
b L\$outer_pa11
ldo `$LOCALS+32+4`($fp),$tp
L\$outerdone_pa11
add $hi0,$ablo,$ablo
addc %r0,$abhi,$abhi
add $ti1,$ablo,$ablo
addc %r0,$abhi,$hi0
ldw 4($tp),$ti0 ; tp[j]
add $hi1,$nmlo1,$nmlo1
addc %r0,$nmhi1,$nmhi1
add $ablo,$nmlo1,$nmlo1
addc %r0,$nmhi1,$hi1
stw $nmlo1,-4($tp) ; tp[j-1]
add $hi1,$hi0,$hi0
addc %r0,%r0,$hi1
add $ti0,$hi0,$hi0
addc %r0,$hi1,$hi1
stw $hi0,0($tp)
stw $hi1,4($tp)
ldo `$LOCALS+32+4`($fp),$tp
sub %r0,%r0,%r0 ; clear borrow
ldw -4($tp),$ti0
addl $tp,$arrsz,$tp
L\$sub_pa11
ldwx $idx($np),$hi0
subb $ti0,$hi0,$hi1
ldwx $idx($tp),$ti0
addib,<> 4,$idx,L\$sub_pa11
stws,ma $hi1,4($rp)
subb $ti0,%r0,$hi1
ldo -4($tp),$tp
and $tp,$hi1,$ap
andcm $rp,$hi1,$bp
or $ap,$bp,$np
sub $rp,$arrsz,$rp ; rewind rp
subi 0,$arrsz,$idx
ldo `$LOCALS+32`($fp),$tp
L\$copy_pa11
ldwx $idx($np),$hi0
stws,ma %r0,4($tp)
addib,<> 4,$idx,L\$copy_pa11
stws,ma $hi0,4($rp)
nop ; alignment
L\$done
___
}
# Common epilogue: report "handled" (%r28=1), drop the tp[] area by
# resetting %sp from $fp, and restore %r2 and %r4-%r10.  L$abort is the
# target of the early sanity-check branches, which arrive with %r28=0 and
# with only the prologue's frame allocated; both paths return through
# %r2 while the last instruction pops the frame and restores %r3.
$code.=<<___;
ldi 1,%r28 ; signal "handled"
ldo $FRAME($fp),%sp ; destroy tp[num+1]
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
# Encoder for "ldd".  Takes the completer string (e.g. "" or ",mb") and
# the operand string, and returns either a ".WORD" line holding the
# instruction word (with the original text as a trailing comment) or, for
# operand forms it does not recognise, the instruction text unchanged.
my $ldd = sub {
    my ($completer, $operands) = @_;
    my $text = "ldd$completer\t$operands";
    my $word;

    if (my ($rx, $rb, $rt) =
            $operands =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        # format 4: register-indexed load
        $word = (0x03<<26) | ($rb<<21) | ($rx<<16) | (3<<6) | $rt;
    }
    elsif (my ($disp, $base, $dest) =
            $operands =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {
        # format 5: short displacement; the low four bits go to bits
        # 17..20 and bit 4 of the displacement lands in bit 16
        $word  = (0x03<<26) | ($base<<21) | (1<<12) | (3<<6) | $dest;
        $word |= (($disp&0xF)<<17) | (($disp&0x10)<<12);
        # any ",m*" completer sets bit 5, ",mb" additionally bit 13
        $word |= (1<<5)  if ($completer =~ /^,m/);
        $word |= (1<<13) if ($completer =~ /^,mb/);
    }
    else {
        return "\t".$text;
    }

    return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
};
# Encoder for "std" with a short displacement.  Takes the completer and
# operand strings and returns a ".WORD" line carrying the instruction
# word, or the instruction text unchanged when the operands do not have
# the "%rS,disp(%rB)" shape.
my $std = sub {
    my ($completer, $operands) = @_;
    my $text = "std$completer\t$operands";

    my ($src, $disp, $base) =
        $operands =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/;
    return "\t".$text unless defined $src;

    # format 6
    my $word = (0x03<<26) | ($base<<21) | ($src<<16) | (1<<12) | (0xB<<6);
    # displacement: low four bits in bits 1..4, bit 4 moved down to bit 0
    $word |= (($disp&0xF)<<1) | (($disp&0x10)>>4);
    # any ",m*" completer sets bit 5, ",mb" additionally bit 13
    $word |= (1<<5)  if ($completer =~ /^,m/);
    $word |= (1<<13) if ($completer =~ /^,mb/);

    return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
};
# Encoder for "extrd".  Only the ",u" completer is used by this module
# and it is implied by the emitted encodings.  Two operand shapes are
# handled: a fixed position ("%rS,pos,len,%rT") and a position taken from
# %sar ("%rS,%sar,len,%rT"); anything else is returned as plain text.
my $extrd = sub {
    my ($completer, $operands) = @_;
    my $text = "extrd$completer\t$operands";
    my $word;

    if (my ($src, $pos, $width, $dst) =
            $operands =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {
        # format 15
        my $len = 32-$width;
        $word  = (0x36<<26) | ($src<<21) | ($dst<<16);
        $word |= (($pos&0x20)<<6) | (($pos&0x1f)<<5);		# encode pos
        $word |= (($len&0x20)<<7) | ($len&0x1f);		# encode len
    }
    elsif (my ($from, $bits, $to) =
            $operands =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {
        # format 12
        my $len = 32-$bits;
        $word  = (0x34<<26) | ($from<<21) | ($to<<16) | (2<<11) | (1<<9);
        $word |= (($len&0x20)<<3) | ($len&0x1f);		# encode len
        # a "=" condition (optionally "*=") sets bit 13
        $word |= (1<<13) if ($completer =~ /,\**=/);
    }
    else {
        return "\t".$text;
    }

    return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
};
# Encoder for "shrpd %rA,%rB,sa,%rT" with an immediate shift amount.
# Returns a ".WORD" line with the instruction word, or the instruction
# text unchanged if the operands have another shape.
my $shrpd = sub {
    my ($completer, $operands) = @_;
    my $text = "shrpd$completer\t$operands";

    my ($r1, $r2, $shift, $dst) =
        $operands =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/;
    return "\t".$text unless defined $r1;

    # format 14; the shift amount is stored as 63-sa
    my $cpos = 63-$shift;
    my $word = (0x34<<26) | ($r2<<21) | ($r1<<16) | (1<<10) | $dst;
    $word |= (($cpos&0x20)<<6) | (($cpos&0x1f)<<5);		# encode sa

    return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
};
# Encoder for "sub".  Only the ",db" form with three register operands is
# turned into a ".WORD" line; every other "sub" is passed through as
# plain text for the assembler to handle.
my $sub = sub {
    my ($completer, $operands) = @_;
    my $text = "sub$completer\t$operands";

    if ($completer eq ",db") {
        if (my ($r1, $r2, $rt) =
                $operands =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
            my $word = (0x02<<26) | ($r2<<21) | ($r1<<16) | $rt;
            $word |= (1<<10);	# e1
            $word |= (1<<8);	# e2
            $word |= (1<<5);	# d
            return sprintf "\t.WORD\t0x%08x\t; %s", $word, $text;
        }
    }

    return "\t".$text;
};
# Translate one instruction.  If a variable named after the mnemonic holds
# a code reference (the encoders defined above), that encoder is called
# with the completer and operand strings; otherwise the instruction is
# re-emitted as "\t<mnemonic><completer>\t<operands>".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;

    # A string eval is used so that the variable is looked up by name.
    my $encoder = eval "\$$mnemonic";

    return $encoder->($mod, $args) if (ref($encoder) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
# Post-process the generated text line by line and write it to STDOUT.
foreach my $line (split("\n",$code)) {
	# evaluate back-ticked arithmetic, e.g. `-$FRAME+1*$SIZE_T`
	$line =~ s/\`([^\`]*)\`/eval $1/ge;

	# flip word order in 64-bit mode...
	$line =~ s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
	# assemble 2.0 instructions in 32-bit mode...
	$line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e if ($BN_SZ==4);

	print $line,"\n";
}
# Buffered write errors (disk full, closed pipe) only surface at close;
# report them instead of leaving a truncated assembler file behind.
close STDOUT or die "error closing STDOUT: $!";

224
crypto/pariscid.pl Normal file
View File

@ -0,0 +1,224 @@
#!/usr/bin/env perl
# Command line: <flavour> <output file>.
$flavour = shift;
$output = shift;

# Redirect STDOUT to the requested file and fail loudly if that is not
# possible; a silently failed open would discard all generated code.
# Without an output argument the existing STDOUT is used.
if (defined $output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

if ($flavour =~ /64/) {
	# 64-bit flavour: doubleword stores in OPENSSL_cleanse
	$LEVEL ="2.0W";
	$SIZE_T =8;
	$ST ="std";
} else {
	# 32-bit flavour: word stores
	$LEVEL ="1.1";
	$SIZE_T =4;
	$ST ="stw";
}

# Registers referred to by name in the code below.
$rp="%r2";
$sp="%r30";
$rv="%r28";
# Assembler preamble followed by three small routines:
#  - OPENSSL_cpuid_setup: returns immediately;
#  - OPENSSL_rdtsc: returns the value of control register %cr16 in $rv;
#  - OPENSSL_wipe_cpu: zeroes %r1, %r19-%r26, %r29, %r31 and the
#    floating-point registers %fr4-%fr11 and %fr22-%fr31, then returns
#    the stack pointer value in $rv.
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT OPENSSL_cpuid_setup,ENTRY
.ALIGN 8
OPENSSL_cpuid_setup
.PROC
.CALLINFO NO_CALLS
.ENTRY
bv ($rp)
.EXIT
nop
.PROCEND
.EXPORT OPENSSL_rdtsc,ENTRY
.ALIGN 8
OPENSSL_rdtsc
.PROC
.CALLINFO NO_CALLS
.ENTRY
mfctl %cr16,$rv
bv ($rp)
.EXIT
nop
.PROCEND
.EXPORT OPENSSL_wipe_cpu,ENTRY
.ALIGN 8
OPENSSL_wipe_cpu
.PROC
.CALLINFO NO_CALLS
.ENTRY
xor %r0,%r0,%r1
fcpy,dbl %fr0,%fr4
xor %r0,%r0,%r19
fcpy,dbl %fr0,%fr5
xor %r0,%r0,%r20
fcpy,dbl %fr0,%fr6
xor %r0,%r0,%r21
fcpy,dbl %fr0,%fr7
xor %r0,%r0,%r22
fcpy,dbl %fr0,%fr8
xor %r0,%r0,%r23
fcpy,dbl %fr0,%fr9
xor %r0,%r0,%r24
fcpy,dbl %fr0,%fr10
xor %r0,%r0,%r25
fcpy,dbl %fr0,%fr11
xor %r0,%r0,%r26
fcpy,dbl %fr0,%fr22
xor %r0,%r0,%r29
fcpy,dbl %fr0,%fr23
xor %r0,%r0,%r31
fcpy,dbl %fr0,%fr24
fcpy,dbl %fr0,%fr25
fcpy,dbl %fr0,%fr26
fcpy,dbl %fr0,%fr27
fcpy,dbl %fr0,%fr28
fcpy,dbl %fr0,%fr29
fcpy,dbl %fr0,%fr30
fcpy,dbl %fr0,%fr31
bv ($rp)
.EXIT
ldo 0($sp),$rv
.PROCEND
___
# OPENSSL_cleanse(inp,len): overwrite len bytes at inp with zeros.
# Lengths up to 15 are cleared byte by byte (Little).  Longer buffers are
# first byte-filled until inp is $SIZE_T-aligned (Lalign), then cleared
# $SIZE_T bytes at a time (Lot), and any remaining tail falls through to
# the byte loop.  The ",*" completers are written in their 64-bit form and
# are rewritten for 32-bit builds just before the code is printed.
{
my $inp="%r26";
my $len="%r25";
$code.=<<___;
.EXPORT OPENSSL_cleanse,ENTRY,ARGW0=GR,ARGW1=GR
.ALIGN 8
OPENSSL_cleanse
.PROC
.CALLINFO NO_CALLS
.ENTRY
cmpib,*= 0,$len,Ldone
nop
cmpib,*>>= 15,$len,Little
ldi $SIZE_T-1,%r1
Lalign
and,*<> $inp,%r1,%r28
b,n Laligned
stb %r0,0($inp)
ldo -1($len),$len
b Lalign
ldo 1($inp),$inp
Laligned
andcm $len,%r1,%r28
Lot
$ST %r0,0($inp)
addib,*<> -$SIZE_T,%r28,Lot
ldo $SIZE_T($inp),$inp
and,*<> $len,%r1,$len
b,n Ldone
Little
stb %r0,0($inp)
addib,*<> -1,$len,Little
ldo 1($inp),$inp
Ldone
bv ($rp)
.EXIT
nop
.PROCEND
___
}
# OPENSSL_instrument_bus(out,cnt) and OPENSSL_instrument_bus2(out,cnt,max).
# Both repeatedly read %cr16, compute the difference to the previous
# reading, flush the cache line at out (fdc) and add the difference to
# the word stored there.  The first variant advances out by one word per
# iteration for cnt iterations; the second advances out only when the
# difference changed and gives up after max probes.
# NOTE(review): %cr16 is presumably the interval timer -- confirm.
{
my ($out,$cnt,$max)=("%r26","%r25","%r24");
my ($tick,$lasttick)=("%r23","%r22");
my ($diff,$lastdiff)=("%r21","%r20");
$code.=<<___;
.EXPORT OPENSSL_instrument_bus,ENTRY,ARGW0=GR,ARGW1=GR
.ALIGN 8
OPENSSL_instrument_bus
.PROC
.CALLINFO NO_CALLS
.ENTRY
copy $cnt,$rv
mfctl %cr16,$tick
copy $tick,$lasttick
ldi 0,$diff
fdc 0($out)
ldw 0($out),$tick
add $diff,$tick,$tick
stw $tick,0($out)
Loop
mfctl %cr16,$tick
sub $tick,$lasttick,$diff
copy $tick,$lasttick
fdc 0($out)
ldw 0($out),$tick
add $diff,$tick,$tick
stw $tick,0($out)
addib,<> -1,$cnt,Loop
addi 4,$out,$out
bv ($rp)
.EXIT
sub $rv,$cnt,$rv
.PROCEND
.EXPORT OPENSSL_instrument_bus2,ENTRY,ARGW0=GR,ARGW1=GR
.ALIGN 8
OPENSSL_instrument_bus2
.PROC
.CALLINFO NO_CALLS
.ENTRY
copy $cnt,$rv
sub %r0,$cnt,$cnt
mfctl %cr16,$tick
copy $tick,$lasttick
ldi 0,$diff
fdc 0($out)
ldw 0($out),$tick
add $diff,$tick,$tick
stw $tick,0($out)
mfctl %cr16,$tick
sub $tick,$lasttick,$diff
copy $tick,$lasttick
Loop2
copy $diff,$lastdiff
fdc 0($out)
ldw 0($out),$tick
add $diff,$tick,$tick
stw $tick,0($out)
addib,= -1,$max,Ldone2
nop
mfctl %cr16,$tick
sub $tick,$lasttick,$diff
copy $tick,$lasttick
cmpclr,<> $lastdiff,$diff,$tick
ldi 1,$tick
ldi 1,%r1
xor %r1,$tick,$tick
addb,<> $tick,$cnt,Loop2
shladd,l $tick,2,$out,$out
Ldone2
bv ($rp)
.EXIT
add $rv,$cnt,$rv
.PROCEND
___
}
# 32-bit builds: the code above is written with 64-bit ",*" condition
# completers; rewrite cmpib,* to comib, and strip the "*" everywhere else.
if ($SIZE_T==4) {
	$code =~ s/cmpib,\*/comib,/gm;
	$code =~ s/,\*/,/gm;
}

print $code;
# Buffered write errors only surface at close; report them instead of
# leaving a truncated assembler file behind.
close STDOUT or die "error closing STDOUT: $!";

View File

@ -0,0 +1,313 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# RC4 for PA-RISC.
# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.
# Directory this script was started from (with trailing separator); it is
# used below to locate opensslconf.h.  Left untouched when $0 carries no
# directory part.
$dir = $1 if ($0 =~ m/(.*[\/\\])[^\/\\]+$/);

# Command line: <flavour> <output file>.
$flavour = shift;
$output = shift;

# Redirect STDOUT to the requested file and fail loudly if that is not
# possible; a silently failed open would discard all generated code.
# Without an output argument the existing STDOUT is used.
if (defined $output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

if ($flavour =~ /64/) {
	# 64-bit flavour: doubleword pushes/pops
	$LEVEL ="2.0W";
	$SIZE_T =8;
	$FRAME_MARKER =80;
	$SAVED_RP =16;
	$PUSH ="std";
	$PUSHMA ="std,ma";
	$POP ="ldd";
	$POPMB ="ldd,mb";
} else {
	# 32-bit flavour: word pushes/pops
	$LEVEL ="1.0";
	$SIZE_T =4;
	$FRAME_MARKER =48;
	$SAVED_RP =20;
	$PUSH ="stw";
	$PUSHMA ="stwm";
	$POP ="ldw";
	$POPMB ="ldwm";
}
$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
				#                 [+ argument transfer]

# Size in bytes of one key-schedule element.  It must agree with the
# RC4_INT type the C code was configured with, so opensslconf.h is probed
# for it; a missing header is tolerated on purpose and leaves the default.
$SZ=1;				# defaults to RC4_CHAR
if (open my $conf,'<',"${dir}../../opensslconf.h") {
	while (my $line = <$conf>) {
		if ($line =~ m/#\s*define\s+RC4_INT\s+(.*)/) {
			# allow trailing blanks or a CR after the type name, so
			# that a "char" definition is not mistaken for a 4-byte
			# one
			$SZ = ($1 =~ /char\s*$/) ? 1 : 4;
			last;
		}
	}
	close $conf;
}

# Load/store mnemonics and index scaling matching the element size.
if ($SZ==1) {	# RC4_CHAR
	$LD="ldb";
	$LDX="ldbx";
	$MKX="addl";
	$ST="stb";
} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
	$LD="ldw";
	$LDX="ldwx,s";
	$MKX="sh2addl";
	$ST="stw";
}
# Register allocation for the RC4 routine.  The first four are loaded
# with the key, len, inp and out values used by the code below.
($key, $len, $inp, $out) = ("%r26", "%r25", "%r24", "%r23");

# Two-element sets that the unrolled loop rotates between steps.
@XX = ("%r19", "%r20");
@TX = ("%r21", "%r22");

($YY, $TY)     = ("%r28", "%r29");
$acc           = "%r1";
($ix, $iy)     = ("%r2", "%r3");
($dat0, $dat1) = ("%r4", "%r5");
$rem           = "%r6";
$mask          = "%r31";
# Append four unrolled RC4 steps to $code.  Each pass emits one copy of the
# template below and then swaps the two entries of @TX and of @XX, so the
# registers that received the "next" index and value in one step are the
# "current" ones in the following step.
# The two back-ticked sprintf() fragments stay in $code as text; they are
# evaluated by the s/`...`/eval/ pass at the end of the script and yield an
# instruction only for $i>0.  Going by the format string, the second one
# deposits $dat1 into $acc at bit position 8*($i-1)+7, width 8, using the
# "zdep" spelling for $i==1 and "dep" otherwise.
# The emitted instruction order is deliberate scheduling; do not reorder.
sub unrolledloopbody {
for ($i=0;$i<4;$i++) {
$code.=<<___;
ldo 1($XX[0]),$XX[1]
`sprintf("$LDX %$TY(%$key),%$dat1") if ($i>0)`
and $mask,$XX[1],$XX[1]
$LDX $YY($key),$TY
$MKX $YY,$key,$ix
$LDX $XX[1]($key),$TX[1]
$MKX $XX[0],$key,$iy
$ST $TX[0],0($ix)
comclr,<> $XX[1],$YY,%r0 ; conditional
copy $TX[0],$TX[1] ; move
`sprintf("%sdep %$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
$ST $TY,0($iy)
addl $TX[0],$TY,$TY
addl $TX[1],$YY,$YY
and $mask,$TY,$TY
and $mask,$YY,$YY
___
push(@TX,shift(@TX)); push(@XX,shift(@XX)); # "rotate" registers
} }
# Append a one-byte-per-iteration RC4 loop to $code.
#   $label - label emitted at the top of the loop and used as branch target
#   $count - register decremented by the closing "addib,<> -1"; the branch
#            back to $label is taken on the "<>" condition of that add
# Each iteration swaps two state entries, loads one input byte through
# $inp($out), XORs it with a value fetched from the state and stores one
# byte at -1($out) after $out has been advanced by 1.
sub foldedloop {
my ($label,$count)=@_;
$code.=<<___;
$label
$MKX $YY,$key,$iy
$LDX $YY($key),$TY
$MKX $XX[0],$key,$ix
$ST $TX[0],0($iy)
ldo 1($XX[0]),$XX[0]
$ST $TY,0($ix)
addl $TX[0],$TY,$TY
ldbx $inp($out),$dat1
and $mask,$TY,$TY
and $mask,$XX[0],$XX[0]
$LDX $TY($key),$acc
$LDX $XX[0]($key),$TX[0]
ldo 1($out),$out
xor $dat1,$acc,$acc
addl $TX[0],$YY,$YY
stb $acc,-1($out)
addib,<> -1,$count,$label ; $count is always small
and $mask,$YY,$YY
___
}
# Template for the RC4 entry point.  This first piece holds the assembler
# directives, the prologue that saves %r2..%r6, the load of the x and y
# indices from the start of the key structure, the warm-up of the first
# state lookup and the test whether $out needs aligning.
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
.PROC
.CALLINFO FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
cmpib,*= 0,$len,L\$abort
sub $inp,$out,$inp ; distance between $inp and $out
$LD `0*$SZ`($key),$XX[0]
$LD `1*$SZ`($key),$YY
ldo `2*$SZ`($key),$key
ldi 0xff,$mask
ldi 3,$dat0
ldo 1($XX[0]),$XX[0] ; warm up loop
and $mask,$XX[0],$XX[0]
$LDX $XX[0]($key),$TX[0]
addl $TX[0],$YY,$YY
cmpib,*>>= 6,$len,L\$oop1 ; is $len large enough to bother?
and $mask,$YY,$YY
and,<> $out,$dat0,$rem ; is $out aligned?
b L\$alignedout
subi 4,$rem,$rem
sub $len,$rem,$len
___
&foldedloop("L\$alignout",$rem); # process till $out is aligned
# With $out aligned, choose between the word loop for aligned input
# (L$oop4) and the variant that realigns input words with vshd.
$code.=<<___;
L\$alignedout ; $len is at least 4 here
and,<> $inp,$dat0,$acc ; is $inp aligned?
b L\$oop4
sub $inp,$acc,$rem ; align $inp
sh3addl $acc,%r0,$acc
subi 32,$acc,$acc
mtctl $acc,%cr11 ; load %sar with vshd align factor
ldwx $rem($out),$dat0
ldo 4($rem),$rem
L\$oop4misalignedinp
___
# Four RC4 steps, then the tail of the misaligned-input loop: fetch the next
# input word, merge it with the previous one and store one output word.
&unrolledloopbody();
$code.=<<___;
$LDX $TY($key),$ix
ldwx $rem($out),$dat1
ldo -4($len),$len
or $ix,$acc,$acc ; last piece, no need to dep
vshd $dat0,$dat1,$iy ; align data
copy $dat1,$dat0
xor $iy,$acc,$acc
stw $acc,0($out)
cmpib,*<< 3,$len,L\$oop4misalignedinp
ldo 4($out),$out
cmpib,*= 0,$len,L\$done
nop
b L\$oop1
nop
.ALIGN 8
L\$oop4
___
# Four RC4 steps, then the tail of the aligned-input word loop.
&unrolledloopbody();
$code.=<<___;
$LDX $TY($key),$ix
ldwx $inp($out),$dat0
ldo -4($len),$len
or $ix,$acc,$acc ; last piece, no need to dep
xor $dat0,$acc,$acc
stw $acc,0($out)
cmpib,*<< 3,$len,L\$oop4
ldo 4($out),$out
cmpib,*= 0,$len,L\$done
nop
___
# Byte loop for whatever is left (also the whole job when $len is small).
&foldedloop("L\$oop1",$len);
# Undo the warm-up step, write the x and y indices back just below the
# state table, restore the saved registers and return.
$code.=<<___;
L\$done
$POP `-$FRAME-$SAVED_RP`(%sp),%r2
ldo -1($XX[0]),$XX[0] ; chill out loop
sub $YY,$TX[0],$YY
and $mask,$XX[0],$XX[0]
and $mask,$YY,$YY
$ST $XX[0],`-2*$SZ`($key)
$ST $YY,`-1*$SZ`($key)
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
___
# Templates for RC4_set_key and RC4_options.
# RC4_set_key zeroes the two index slots, fills the 256-entry table with
# 0..255 (loop L$1st), then runs the swap pass (loop L$2nd) over the key
# bytes, restarting the key index from -$len whenever the increment at
# "addi,nuv" wraps.
# RC4_options returns in %r28 the address of the L$opts string, computed
# relative to L$pic from the value left by blr with its two low bits
# cleared.
$code.=<<___;
.EXPORT RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
.ALIGN 8
RC4_set_key
.PROC
.CALLINFO NO_CALLS
.ENTRY
$ST %r0,`0*$SZ`($key)
$ST %r0,`1*$SZ`($key)
ldo `2*$SZ`($key),$key
copy %r0,@XX[0]
L\$1st
$ST @XX[0],0($key)
ldo 1(@XX[0]),@XX[0]
bb,>= @XX[0],`31-8`,L\$1st ; @XX[0]<256
ldo $SZ($key),$key
ldo `-256*$SZ`($key),$key ; rewind $key
addl $len,$inp,$inp ; $inp to point at the end
sub %r0,$len,%r23 ; inverse index
copy %r0,@XX[0]
copy %r0,@XX[1]
ldi 0xff,$mask
L\$2nd
$LDX @XX[0]($key),@TX[0]
ldbx %r23($inp),@TX[1]
addi,nuv 1,%r23,%r23 ; increment and conditional
sub %r0,$len,%r23 ; inverse index
addl @TX[0],@XX[1],@XX[1]
addl @TX[1],@XX[1],@XX[1]
and $mask,@XX[1],@XX[1]
$MKX @XX[0],$key,$TY
$LDX @XX[1]($key),@TX[1]
$MKX @XX[1],$key,$YY
ldo 1(@XX[0]),@XX[0]
$ST @TX[0],0($YY)
bb,>= @XX[0],`31-8`,L\$2nd ; @XX[0]<256
$ST @TX[1],0($TY)
bv,n (%r2)
.EXIT
nop
.PROCEND
.EXPORT RC4_options,ENTRY
.ALIGN 8
RC4_options
.PROC
.CALLINFO NO_CALLS
.ENTRY
blr %r0,%r28
ldi 3,%r1
L\$pic
andcm %r28,%r1,%r28
bv (%r2)
.EXIT
ldo L\$opts-L\$pic(%r28),%r28
.PROCEND
.ALIGN 8
L\$opts
.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Replace every back-ticked fragment in the template with the result of
# evaluating it as Perl, then, for $SIZE_T==4 builds, rewrite "cmpib,*" as
# "comib,".
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/cmpib,\*/comib,/gm if ($SIZE_T==4);
print $code;
# Write errors on a buffered handle are only reported when it is closed,
# so a failed close must abort the build rather than leave a truncated file.
close STDOUT or die "error closing STDOUT: $!";

View File

@ -0,0 +1,259 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA1 block procedure for PA-RISC.
# June 2009.
#
# On PA-7100LC performance is >30% better than gcc 3.2 generated code
# for aligned input and >50% better for unaligned. Compared to vendor
# compiler on PA-8600 it's almost 60% faster in 64-bit build and just
# few percent faster in 32-bit one (this for aligned input, data for
# unaligned input is not available).
#
# Special thanks to polarhome.com for providing HP-UX account.
# Command line: <flavour> [<output file>]
$flavour = shift;
$output = shift;
# Send the generated assembly to the output file.  The three-argument form
# keeps characters in the file name from being parsed as an open mode, and
# a failure to open is now fatal instead of being ignored.  When no output
# file is given STDOUT is left untouched.
if (defined($output) && $output ne "") {
    open STDOUT,">",$output or die "can't open $output: $!";
}
# A flavour containing "64" selects the 2.0W level with 8-byte stack slots;
# anything else selects level 1.0 with 4-byte slots.  $PUSH/$POP and their
# modifying variants are the store/load mnemonics of matching width.
if ($flavour =~ /64/) {
    $LEVEL ="2.0W";
    $SIZE_T =8;
    $FRAME_MARKER =80;
    $SAVED_RP =16;
    $PUSH ="std";
    $PUSHMA ="std,ma";
    $POP ="ldd";
    $POPMB ="ldd,mb";
} else {
    $LEVEL ="1.0";
    $SIZE_T =4;
    $FRAME_MARKER =48;
    $SAVED_RP =20;
    $PUSH ="stw";
    $PUSHMA ="stwm";
    $POP ="ldw";
    $POPMB ="ldwm";
}
$FRAME=14*$SIZE_T+$FRAME_MARKER;# 14 saved regs + frame marker
# [+ argument transfer]
# Register allocation for the SHA1 templates.
($ctx,$inp,$num)=("%r26","%r25","%r24");	# arg0, arg1, arg2
($t0,$t1,$K)=("%r28","%r29","%r31");		# two temporaries and the round constant
# Sixteen message-schedule registers %r1..%r16, plus $t0 as a 17th slot
# that is only used while loading and aligning an input block.
@X=((map { "%r$_" } (1..16)),$t0);
# Working variables; @V is rotated between rounds, $A..$E keep these
# initial names for the context load/store code.
@V=("%r19","%r20","%r21","%r22","%r23");
($A,$B,$C,$D,$E)=@V;
# Append one SHA1 round of the first group (rounds 0..19) to $code.
#   $i           - round number
#   $a,$b,$c,$d,$e - registers currently playing the five working-variable
#                  roles (the caller rotates @V between rounds)
# The emitted code adds $K, X[$i], a 27-bit shd of $a with itself and
# (($c & $b) | ($d & ~$b)) into $e, and replaces $b by a 2-bit shd of $b
# with itself.
# For $i>=15 the template additionally updates the schedule slot for round
# $j=$i+1: X[$j%16] is XORed with X[($j+2)%16], X[($j+8)%16] and
# X[($j+13)%16], then shifted with "shd X,X,31".
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<15);
addl $K,$e,$e ; $i
shd $a,$a,27,$t1
addl @X[$i],$e,$e
and $c,$b,$t0
addl $t1,$e,$e
andcm $d,$b,$t1
shd $b,$b,2,$b
or $t1,$t0,$t0
addl $t0,$e,$e
___
$code.=<<___ if ($i>=15); # with forward Xupdate
addl $K,$e,$e ; $i
shd $a,$a,27,$t1
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
addl @X[$i%16],$e,$e
and $c,$b,$t0
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
addl $t1,$e,$e
andcm $d,$b,$t1
shd $b,$b,2,$b
or $t1,$t0,$t0
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
add $t0,$e,$e
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
___
}
# Append one SHA1 round that uses the three-way XOR of $b, $c and $d; the
# driver calls it for rounds 20..39 and again for rounds 60..79.
#   $i           - round number
#   $a,$b,$c,$d,$e - registers currently playing the five working-variable
#                  roles
# For $i<79 the schedule slot for round $j=$i+1 is updated in the same
# template.  For $i==79 no further schedule word is needed, so those slots
# in the instruction stream are used to load the five context words from
# $ctx into X[0..4] for the final accumulation done by the caller.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i<79);
xor @X[($j+2)%16],@X[$j%16],@X[$j%16] ; $i
addl $K,$e,$e
shd $a,$a,27,$t1
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
addl @X[$i%16],$e,$e
xor $b,$c,$t0
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
addl $t1,$e,$e
shd $b,$b,2,$b
xor $d,$t0,$t0
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
addl $t0,$e,$e
___
$code.=<<___ if ($i==79); # with context load
ldw 0($ctx),@X[0] ; $i
addl $K,$e,$e
shd $a,$a,27,$t1
ldw 4($ctx),@X[1]
addl @X[$i%16],$e,$e
xor $b,$c,$t0
ldw 8($ctx),@X[2]
addl $t1,$e,$e
shd $b,$b,2,$b
xor $d,$t0,$t0
ldw 12($ctx),@X[3]
addl $t0,$e,$e
ldw 16($ctx),@X[4]
___
}
# Append one SHA1 round of the third group (rounds 40..59) to $code.
#   $i           - round number
#   $a,$b,$c,$d,$e - registers currently playing the five working-variable
#                  roles
# The round function is added to $e in two pieces: ($b & ($d ^ $c)) via $t0
# and ($d & $c) via $t1.  The schedule slot for round $j=$i+1 is updated in
# the same template.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
shd $a,$a,27,$t1 ; $i
addl $K,$e,$e
xor @X[($j+2)%16],@X[$j%16],@X[$j%16]
xor $d,$c,$t0
addl @X[$i%16],$e,$e
xor @X[($j+8)%16],@X[$j%16],@X[$j%16]
and $b,$t0,$t0
addl $t1,$e,$e
shd $b,$b,2,$b
xor @X[($j+13)%16],@X[$j%16],@X[$j%16]
addl $t0,$e,$e
and $d,$c,$t1
shd @X[$j%16],@X[$j%16],31,@X[$j%16]
addl $t1,$e,$e
___
}
# Template for sha1_block_data_order: directives, prologue saving %r2..%r16,
# load of the five context words, and setup of %sar (%cr11) from the low two
# bits of $inp for the vshd-based input alignment.
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.EXPORT sha1_block_data_order,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
sha1_block_data_order
.PROC
.CALLINFO FRAME=`$FRAME-14*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=16
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
ldw 0($ctx),$A
ldw 4($ctx),$B
ldw 8($ctx),$C
ldw 12($ctx),$D
ldw 16($ctx),$E
extru $inp,31,2,$t0 ; t0=inp&3;
sh3addl $t0,%r0,$t0 ; t0*=8;
subi 32,$t0,$t0 ; t0=32-t0;
mtctl $t0,%cr11 ; %sar=t0;
L\$oop
ldi 3,$t0
andcm $inp,$t0,$t0 ; 64-bit neutral
___
# Words 0..14 are loaded from the word-aligned address in $t0; word 15 is
# loaded in the delay slot of the alignment test below, and a 17th word is
# fetched only on the misaligned path.
for ($i=0;$i<15;$i++) { # load input block
$code.="\tldw `4*$i`($t0),@X[$i]\n"; }
$code.=<<___;
cmpb,*= $inp,$t0,L\$aligned
ldw 60($t0),@X[15]
ldw 64($t0),@X[16]
___
for ($i=0;$i<16;$i++) { # align input
$code.="\tvshd @X[$i],@X[$i+1],@X[$i]\n"; }
# Eighty rounds in four groups of twenty.  Each group first builds its
# round constant in $K with ldil/ldo, and @V is rotated by one position
# after every round.  $i carries over from one loop to the next.
$code.=<<___;
L\$aligned
ldil L'0x5a827000,$K ; K_00_19
ldo 0x999($K),$K
___
for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0x6ed9e000,$K ; K_20_39
ldo 0xba1($K),$K
___
for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0x8f1bb000,$K ; K_40_59
ldo 0xcdc($K),$K
___
for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
ldil L'0xca62c000,$K ; K_60_79
ldo 0x1d6($K),$K
___
for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Add the previous context (loaded into X[0..4] by round 79) to the working
# variables, store the result, loop while blocks remain, then restore the
# saved registers and return.
$code.=<<___;
addl @X[0],$A,$A
addl @X[1],$B,$B
addl @X[2],$C,$C
addl @X[3],$D,$D
addl @X[4],$E,$E
stw $A,0($ctx)
stw $B,4($ctx)
stw $C,8($ctx)
stw $D,12($ctx)
stw $E,16($ctx)
addib,*<> -1,$num,L\$oop
ldo 64($inp),$inp
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "SHA1 block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Replace every back-ticked fragment in the template with the result of
# evaluating it as Perl, then, for $SIZE_T==4 builds, strip the ",*"
# completer prefix from the generated instructions.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/,\*/,/gm if ($SIZE_T==4);
print $code;
# Write errors on a buffered handle are only reported when it is closed,
# so a failed close must abort the build rather than leave a truncated file.
close STDOUT or die "error closing STDOUT: $!";

791
crypto/sha/asm/sha512-parisc.pl Executable file
View File

@ -0,0 +1,791 @@
#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256/512 block procedure for PA-RISC.
# June 2009.
#
# SHA256 performance is >75% better than gcc 3.2 generated code on
# PA-7100LC. Compared to code generated by vendor compiler this
# implementation is almost 70% faster in 64-bit build, but delivers
# virtually same performance in 32-bit build on PA-8600.
#
# SHA512 performance is >2.9x better than gcc 3.2 generated code on
# PA-7100LC, PA-RISC 1.1 processor. Then implementation detects if the
# code is executed on PA-RISC 2.0 processor and switches to 64-bit
# code path delivering adequate performance even in "blended" 32-bit
# build. Though 64-bit code is not any faster than code generated by
# vendor compiler on PA-8600...
#
# Special thanks to polarhome.com for providing HP-UX account.
# Command line: <flavour> [<output file>].  The output file name is also
# inspected further down to choose between the SHA512 and SHA256 variants.
$flavour = shift;
$output = shift;
# Send the generated assembly to the output file.  The three-argument form
# keeps characters in the file name from being parsed as an open mode, and
# a failure to open is now fatal instead of being ignored.  When no output
# file is given STDOUT is left untouched.
if (defined($output) && $output ne "") {
    open STDOUT,">",$output or die "can't open $output: $!";
}
# A flavour containing "64" selects the 2.0W level with 8-byte stack slots;
# anything else selects level 1.0 with 4-byte slots.  $PUSH/$POP and their
# modifying variants are the store/load mnemonics of matching width.
if ($flavour =~ /64/) {
    $LEVEL ="2.0W";
    $SIZE_T =8;
    $FRAME_MARKER =80;
    $SAVED_RP =16;
    $PUSH ="std";
    $PUSHMA ="std,ma";
    $POP ="ldd";
    $POPMB ="ldd,mb";
} else {
    $LEVEL ="1.0";
    $SIZE_T =4;
    $FRAME_MARKER =48;
    $SAVED_RP =20;
    $PUSH ="stw";
    $PUSHMA ="stwm";
    $POP ="ldw";
    $POPMB ="ldwm";
}
# Pick the hash variant from the output file name: a name containing "512"
# generates sha512_block_data_order, anything else sha256_block_data_order.
#   $SZ          - size of one word of the hash in bytes
#   @Sigma0/1    - shift counts used by the round templates
#   @sigma0/1    - shift counts used by the message-schedule templates
#   $LAST10BITS  - low ten bits of the final table constant; the round code
#                  compares against it to detect the end of the table
#   $LD/$LDM/$ST - load, load-and-modify and store mnemonics for $SZ bytes
if ($output =~ /512/) {
    ($func,$SZ,$rounds,$LAST10BITS) = ("sha512_block_data_order",8,80,0x017);
    @Sigma0=(28,34,39);
    @Sigma1=(14,18,41);
    @sigma0=(1, 8, 7);
    @sigma1=(19,61, 6);
    ($LD,$LDM,$ST) = ("ldd","ldd,ma","std");
} else {
    ($func,$SZ,$rounds,$LAST10BITS) = ("sha256_block_data_order",4,64,0x0f2);
    @Sigma0=( 2,13,22);
    @Sigma1=( 6,11,25);
    @sigma0=( 7,18, 3);
    @sigma1=(17,19,10);
    ($LD,$LDM,$ST) = ("ldw","ldwm","stw");
}
# Stack layout.  The local area holds 16 schedule words of $SZ bytes plus
# 32 bytes; $XOFF is the distance between %sp and that local area, and
# $FRAME adds room for the 16 saved registers on top of it
# [+ argument transfer inside the frame marker].
$XOFF=$FRAME_MARKER+16*$SZ+32;
$FRAME=16*$SIZE_T+$XOFF;
# Register allocation.  The three argument registers double as scratch
# registers once the arguments have been saved on the stack.
($ctx,$inp,$num)=("%r26","%r25","%r24");	# zapped by $a0, $a1, $t0
($a0,$a1,$t0)=("%r26","%r25","%r24");
($t1,$Tbl)=("%r29","%r31");
# Working variables; @V is rotated between rounds.
@V=("%r17","%r18","%r19","%r20","%r21","%r22","%r23","%r28");
($A,$B,$C,$D,$E,$F,$G,$H)=@V;
# Sixteen schedule registers %r1..%r16, plus $inp as a 17th slot for the
# extra word needed when aligning input.
@X=((map { "%r$_" } (1..16)),$inp);
# Append one SHA256/512 round to $code.
#   $i     - round number; X[$i%16] is the message word added to $h
#   $a..$h - registers currently playing the eight working-variable roles
#            (the caller rotates @V between rounds)
# On entry $t1 is expected to hold the table constant for this round,
# fetched by the preceding "$LDM $SZ($Tbl),$t1"; the first "addl $t1,$h,$h"
# consumes it.  For $i<15 the back-ticked fragment near the end fetches the
# constant for the following round; for other values the schedule code in
# ROUND_16_xx does that fetch instead.
# _ror is a made-up mnemonic that the final translation loop of this script
# rewrites into a real instruction.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$code.=<<___;
_ror $e,$Sigma1[0],$a0
and $f,$e,$t0
_ror $e,$Sigma1[1],$a1
addl $t1,$h,$h
andcm $g,$e,$t1
xor $a1,$a0,$a0
_ror $a1,`$Sigma1[2]-$Sigma1[1]`,$a1
or $t0,$t1,$t1 ; Ch(e,f,g)
addl @X[$i%16],$h,$h
xor $a0,$a1,$a1 ; Sigma1(e)
addl $t1,$h,$h
_ror $a,$Sigma0[0],$a0
addl $a1,$h,$h
_ror $a,$Sigma0[1],$a1
and $a,$b,$t0
and $a,$c,$t1
xor $a1,$a0,$a0
_ror $a1,`$Sigma0[2]-$Sigma0[1]`,$a1
xor $t1,$t0,$t0
and $b,$c,$t1
xor $a0,$a1,$a1 ; Sigma0(a)
addl $h,$d,$d
xor $t1,$t0,$t0 ; Maj(a,b,c)
`"$LDM $SZ($Tbl),$t1" if ($i<15)`
addl $a1,$h,$h
addl $t0,$h,$h
___
}
# Append one round with message-schedule update (rounds 16 and up).
#   $i     - round number as counted by the caller (16..31); it is reduced
#            by 16 here so that it indexes @X directly
#   $a..$h - registers currently playing the eight working-variable roles
# The template adds X[($i+9)%16], sigma0 of X[($i+1)%16] and sigma1 of
# X[($i+14)%16] into X[$i], fetches the next table constant into $t1, and
# then reuses ROUND_00_15 for the round itself.
# For the last slot ($i==15 after the adjustment) it also compares the low
# ten bits of the constant just fetched with $LAST10BITS and, on a match,
# bumps $Tbl by 1 so that the caller's "bb" test on bit 31 of $Tbl sees the
# end of the table.
sub ROUND_16_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
$i-=16;
$code.=<<___;
_ror @X[($i+1)%16],$sigma0[0],$a0
_ror @X[($i+1)%16],$sigma0[1],$a1
addl @X[($i+9)%16],@X[$i],@X[$i]
_ror @X[($i+14)%16],$sigma1[0],$t0
_ror @X[($i+14)%16],$sigma1[1],$t1
xor $a1,$a0,$a0
_shr @X[($i+1)%16],$sigma0[2],$a1
xor $t1,$t0,$t0
_shr @X[($i+14)%16],$sigma1[2],$t1
xor $a1,$a0,$a0 ; sigma0(X[(i+1)&0x0f])
xor $t1,$t0,$t0 ; sigma1(X[(i+14)&0x0f])
$LDM $SZ($Tbl),$t1
addl $a0,@X[$i],@X[$i]
addl $t0,@X[$i],@X[$i]
___
$code.=<<___ if ($i==15);
extru $t1,31,10,$a1
comiclr,<> $LAST10BITS,$a1,%r0
ldo 1($Tbl),$Tbl ; signal end of $Tbl
___
&ROUND_00_15($i+16,$a,$b,$c,$d,$e,$f,$g,$h);
}
# Start of the generated file: directives and the constant table L$table,
# placed in the code subspace ahead of the function so it can be addressed
# relative to L$pic.
$code=<<___;
.LEVEL $LEVEL
.SPACE \$TEXT\$
.SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
.ALIGN 64
L\$table
___
# SHA512 variant: 80 constants of 64 bits, each written as two 32-bit
# words, high word first.
$code.=<<___ if ($SZ==8);
.WORD 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd
.WORD 0xb5c0fbcf,0xec4d3b2f,0xe9b5dba5,0x8189dbbc
.WORD 0x3956c25b,0xf348b538,0x59f111f1,0xb605d019
.WORD 0x923f82a4,0xaf194f9b,0xab1c5ed5,0xda6d8118
.WORD 0xd807aa98,0xa3030242,0x12835b01,0x45706fbe
.WORD 0x243185be,0x4ee4b28c,0x550c7dc3,0xd5ffb4e2
.WORD 0x72be5d74,0xf27b896f,0x80deb1fe,0x3b1696b1
.WORD 0x9bdc06a7,0x25c71235,0xc19bf174,0xcf692694
.WORD 0xe49b69c1,0x9ef14ad2,0xefbe4786,0x384f25e3
.WORD 0x0fc19dc6,0x8b8cd5b5,0x240ca1cc,0x77ac9c65
.WORD 0x2de92c6f,0x592b0275,0x4a7484aa,0x6ea6e483
.WORD 0x5cb0a9dc,0xbd41fbd4,0x76f988da,0x831153b5
.WORD 0x983e5152,0xee66dfab,0xa831c66d,0x2db43210
.WORD 0xb00327c8,0x98fb213f,0xbf597fc7,0xbeef0ee4
.WORD 0xc6e00bf3,0x3da88fc2,0xd5a79147,0x930aa725
.WORD 0x06ca6351,0xe003826f,0x14292967,0x0a0e6e70
.WORD 0x27b70a85,0x46d22ffc,0x2e1b2138,0x5c26c926
.WORD 0x4d2c6dfc,0x5ac42aed,0x53380d13,0x9d95b3df
.WORD 0x650a7354,0x8baf63de,0x766a0abb,0x3c77b2a8
.WORD 0x81c2c92e,0x47edaee6,0x92722c85,0x1482353b
.WORD 0xa2bfe8a1,0x4cf10364,0xa81a664b,0xbc423001
.WORD 0xc24b8b70,0xd0f89791,0xc76c51a3,0x0654be30
.WORD 0xd192e819,0xd6ef5218,0xd6990624,0x5565a910
.WORD 0xf40e3585,0x5771202a,0x106aa070,0x32bbd1b8
.WORD 0x19a4c116,0xb8d2d0c8,0x1e376c08,0x5141ab53
.WORD 0x2748774c,0xdf8eeb99,0x34b0bcb5,0xe19b48a8
.WORD 0x391c0cb3,0xc5c95a63,0x4ed8aa4a,0xe3418acb
.WORD 0x5b9cca4f,0x7763e373,0x682e6ff3,0xd6b2b8a3
.WORD 0x748f82ee,0x5defb2fc,0x78a5636f,0x43172f60
.WORD 0x84c87814,0xa1f0ab72,0x8cc70208,0x1a6439ec
.WORD 0x90befffa,0x23631e28,0xa4506ceb,0xde82bde9
.WORD 0xbef9a3f7,0xb2c67915,0xc67178f2,0xe372532b
.WORD 0xca273ece,0xea26619c,0xd186b8c7,0x21c0c207
.WORD 0xeada7dd6,0xcde0eb1e,0xf57d4f7f,0xee6ed178
.WORD 0x06f067aa,0x72176fba,0x0a637dc5,0xa2c898a6
.WORD 0x113f9804,0xbef90dae,0x1b710b35,0x131c471b
.WORD 0x28db77f5,0x23047d84,0x32caab7b,0x40c72493
.WORD 0x3c9ebe0a,0x15c9bebc,0x431d67c4,0x9c100d4c
.WORD 0x4cc5d4be,0xcb3e42b6,0x597f299c,0xfc657e2a
.WORD 0x5fcb6fab,0x3ad6faec,0x6c44198c,0x4a475817
___
# SHA256 variant: 64 constants of 32 bits.
$code.=<<___ if ($SZ==4);
.WORD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.WORD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.WORD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.WORD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.WORD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.WORD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.WORD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.WORD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.WORD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.WORD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.WORD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.WORD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.WORD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.WORD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.WORD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.WORD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
___
# Function entry: prologue saving %r2..%r18, conversion of the block count
# in $num into an end-of-input pointer, saving of the three arguments below
# the frame marker, and position-independent computation of the table
# address in $Tbl.
$code.=<<___;
.EXPORT $func,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
.ALIGN 64
$func
.PROC
.CALLINFO FRAME=`$FRAME-16*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=18
.ENTRY
$PUSH %r2,-$SAVED_RP(%sp) ; standard prologue
$PUSHMA %r3,$FRAME(%sp)
$PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp)
$PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp)
$PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp)
$PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp)
$PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp)
$PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp)
$PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp)
$PUSH %r11,`-$FRAME+8*$SIZE_T`(%sp)
$PUSH %r12,`-$FRAME+9*$SIZE_T`(%sp)
$PUSH %r13,`-$FRAME+10*$SIZE_T`(%sp)
$PUSH %r14,`-$FRAME+11*$SIZE_T`(%sp)
$PUSH %r15,`-$FRAME+12*$SIZE_T`(%sp)
$PUSH %r16,`-$FRAME+13*$SIZE_T`(%sp)
$PUSH %r17,`-$FRAME+14*$SIZE_T`(%sp)
$PUSH %r18,`-$FRAME+15*$SIZE_T`(%sp)
_shl $num,`log(16*$SZ)/log(2)`,$num
addl $inp,$num,$num ; $num to point at the end of $inp
$PUSH $num,`-$FRAME_MARKER-4*$SIZE_T`(%sp) ; save arguments
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp)
$PUSH $ctx,`-$FRAME_MARKER-2*$SIZE_T`(%sp)
blr %r0,$Tbl
ldi 3,$t1
L\$pic
andcm $Tbl,$t1,$Tbl ; wipe privilege level
ldo L\$table-L\$pic($Tbl),$Tbl
___
# SHA512 in a 32-bit build only: run-time dispatch to the L$parisc1 code
# path emitted further down.
$code.=<<___ if ($SZ==8 && $SIZE_T==4);
ldi 31,$t1
mtctl $t1,%cr11
extrd,u,*= $t1,%sar,1,$t1 ; executes on PA-RISC 1.0
b L\$parisc1
nop
___
# Load the eight context words and derive the %sar alignment factor from
# the low bits of $inp.  L$oop is the per-block loop; it starts by fetching
# the first table constant and the word-aligned input address.
$code.=<<___;
$LD `0*$SZ`($ctx),$A ; load context
$LD `1*$SZ`($ctx),$B
$LD `2*$SZ`($ctx),$C
$LD `3*$SZ`($ctx),$D
$LD `4*$SZ`($ctx),$E
$LD `5*$SZ`($ctx),$F
$LD `6*$SZ`($ctx),$G
$LD `7*$SZ`($ctx),$H
extru $inp,31,`log($SZ)/log(2)`,$t0
sh3addl $t0,%r0,$t0
subi `8*$SZ`,$t0,$t0
mtctl $t0,%cr11 ; load %sar with align factor
L\$oop
ldi `$SZ-1`,$t0
$LDM $SZ($Tbl),$t1
andcm $inp,$t0,$t0 ; align $inp
___
# Words 0..14 are loaded here; word 15 is loaded in the delay slot of the
# alignment test, and a 17th word only on the misaligned path.
for ($i=0;$i<15;$i++) { # load input block
$code.="\t$LD `$SZ*$i`($t0),@X[$i]\n"; }
$code.=<<___;
cmpb,*= $inp,$t0,L\$aligned
$LD `$SZ*15`($t0),@X[15]
$LD `$SZ*16`($t0),@X[16]
___
for ($i=0;$i<16;$i++) { # align data
$code.="\t_align @X[$i],@X[$i+1],@X[$i]\n"; }
$code.=<<___;
L\$aligned
nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
___
# Rounds 0..15 straight from the input block; @V is rotated by one
# position after every round.
for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
L\$rounds
nop ; otherwise /usr/ccs/bin/as is confused by below .WORD
___
# Sixteen rounds with schedule update; the generated code loops back to
# L$rounds until ROUND_16_xx has flagged the end of the table in $Tbl.
for(;$i<32;$i++) { &ROUND_16_xx($i,@V); unshift(@V,pop(@V)); }
# Restore the saved arguments, rewind $Tbl (the extra -1 removes the
# end-of-table flag), add the working variables to the context, and loop
# while $inp has not reached the end pointer in $num.
$code.=<<___;
bb,>= $Tbl,31,L\$rounds ; end of $Tbl signalled?
nop
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
ldo `-$rounds*$SZ-1`($Tbl),$Tbl ; rewind $Tbl
$LD `0*$SZ`($ctx),@X[0] ; load context
$LD `1*$SZ`($ctx),@X[1]
$LD `2*$SZ`($ctx),@X[2]
$LD `3*$SZ`($ctx),@X[3]
$LD `4*$SZ`($ctx),@X[4]
$LD `5*$SZ`($ctx),@X[5]
addl @X[0],$A,$A
$LD `6*$SZ`($ctx),@X[6]
addl @X[1],$B,$B
$LD `7*$SZ`($ctx),@X[7]
ldo `16*$SZ`($inp),$inp ; advance $inp
$ST $A,`0*$SZ`($ctx) ; save context
addl @X[2],$C,$C
$ST $B,`1*$SZ`($ctx)
addl @X[3],$D,$D
$ST $C,`2*$SZ`($ctx)
addl @X[4],$E,$E
$ST $D,`3*$SZ`($ctx)
addl @X[5],$F,$F
$ST $E,`4*$SZ`($ctx)
addl @X[6],$G,$G
$ST $F,`5*$SZ`($ctx)
addl @X[7],$H,$H
$ST $G,`6*$SZ`($ctx)
$ST $H,`7*$SZ`($ctx)
cmpb,*<>,n $inp,$num,L\$oop
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
___
if ($SZ==8 && $SIZE_T==4) # SHA512 for 32-bit PA-RISC 1.0
{{
# The main code path ends by branching to L$done; L$parisc1 is the target
# of the run-time dispatch emitted earlier.
$code.=<<___;
b L\$done
nop
.ALIGN 64
L\$parisc1
___
# Register allocation for the 32-bit code path.  Every 64-bit working
# variable occupies a hi/lo register pair, so @V has sixteen entries and is
# rotated two positions per round.  The scratch registers and @X are
# reassigned here, replacing the allocation used by the main code path.
@V=( $Ahi, $Alo, $Bhi, $Blo, $Chi, $Clo, $Dhi, $Dlo,
$Ehi, $Elo, $Fhi, $Flo, $Ghi, $Glo, $Hhi, $Hlo) =
( "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7", "%r8",
"%r9","%r10","%r11","%r12","%r13","%r14","%r15","%r16");
$a0 ="%r17";
$a1 ="%r18";
$a2 ="%r19";
$a3 ="%r20";
$t0 ="%r21";
$t1 ="%r22";
$t2 ="%r28";
$t3 ="%r29";
$Tbl="%r31";
# Two hi/lo pairs: the current schedule word and the next one.
@X=("%r23","%r24","%r25","%r26"); # zaps $num,$inp,$ctx
# Append one SHA512 round for the 32-bit code path to $code.
#   $i          - round number (0..15)
#   $ahi..$hlo  - hi/lo register pairs currently playing the eight
#                 working-variable roles
#   $flag       - true when called from ROUND_16_xx_pa1, which has already
#                 loaded X[i+1]; when false the load is emitted here
# 64-bit additions are done as add (low word) followed by addc (high word).
# The table constant K[i] is fetched into the registers that held X[i]
# after X[i] has been added to h.
# For $i==15 with $flag set, the low ten bits of the constant just used are
# compared with $LAST10BITS and the code branches back to L$rounds_pa1
# unless they match.
# Finally @X is rotated by two entries so the "next" pair becomes current.
sub ROUND_00_15_pa1 {
my ($i,$ahi,$alo,$bhi,$blo,$chi,$clo,$dhi,$dlo,
$ehi,$elo,$fhi,$flo,$ghi,$glo,$hhi,$hlo,$flag)=@_;
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
$code.=<<___ if (!$flag);
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
___
$code.=<<___;
shd $ehi,$elo,$Sigma1[0],$t0
add $Xlo,$hlo,$hlo
shd $elo,$ehi,$Sigma1[0],$t1
addc $Xhi,$hhi,$hhi ; h += X[i]
shd $ehi,$elo,$Sigma1[1],$t2
ldwm 8($Tbl),$Xhi
shd $elo,$ehi,$Sigma1[1],$t3
ldw -4($Tbl),$Xlo ; load K[i]
xor $t2,$t0,$t0
xor $t3,$t1,$t1
and $flo,$elo,$a0
and $fhi,$ehi,$a1
shd $ehi,$elo,$Sigma1[2],$t2
andcm $glo,$elo,$a2
shd $elo,$ehi,$Sigma1[2],$t3
andcm $ghi,$ehi,$a3
xor $t2,$t0,$t0
xor $t3,$t1,$t1 ; Sigma1(e)
add $Xlo,$hlo,$hlo
xor $a2,$a0,$a0
addc $Xhi,$hhi,$hhi ; h += K[i]
xor $a3,$a1,$a1 ; Ch(e,f,g)
add $t0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[0],$t0
addc $t1,$hhi,$hhi ; h += Sigma1(e)
shd $alo,$ahi,$Sigma0[0],$t1
add $a0,$hlo,$hlo
shd $ahi,$alo,$Sigma0[1],$t2
addc $a1,$hhi,$hhi ; h += Ch(e,f,g)
shd $alo,$ahi,$Sigma0[1],$t3
xor $t2,$t0,$t0
xor $t3,$t1,$t1
shd $ahi,$alo,$Sigma0[2],$t2
and $alo,$blo,$a0
shd $alo,$ahi,$Sigma0[2],$t3
and $ahi,$bhi,$a1
xor $t2,$t0,$t0
xor $t3,$t1,$t1 ; Sigma0(a)
and $alo,$clo,$a2
and $ahi,$chi,$a3
xor $a2,$a0,$a0
add $hlo,$dlo,$dlo
xor $a3,$a1,$a1
addc $hhi,$dhi,$dhi ; d += h
and $blo,$clo,$a2
add $t0,$hlo,$hlo
and $bhi,$chi,$a3
addc $t1,$hhi,$hhi ; h += Sigma0(a)
xor $a2,$a0,$a0
add $a0,$hlo,$hlo
xor $a3,$a1,$a1 ; Maj(a,b,c)
addc $a1,$hhi,$hhi ; h += Maj(a,b,c)
___
$code.=<<___ if ($i==15 && $flag);
extru $Xlo,31,10,$Xlo
comiclr,= $LAST10BITS,$Xlo,%r0
b L\$rounds_pa1
nop
___
push(@X,shift(@X)); push(@X,shift(@X));
}
# Append one SHA512 round with message-schedule update for the 32-bit code
# path.
#   first argument - round number as counted by the caller (16..31); it is
#                    reduced by 16 so that it indexes the on-stack X[] array
#   remaining      - the sixteen hi/lo working-variable registers, passed
#                    through unchanged to ROUND_00_15_pa1
# The template loads X[i+1], X[i+9] and X[i+14] from the stack, adds
# X[i+9], sigma0(X[i+1]) and sigma1(X[i+14]) into X[i] using add/addc
# pairs, stores the updated X[i] back, and then emits the round itself via
# ROUND_00_15_pa1 with its flag argument set (X[i+1] is already loaded).
# The shd instructions with a shift count above 31 are rewritten by the
# final translation loop of this script.
sub ROUND_16_xx_pa1 {
my ($Xhi,$Xlo,$Xnhi,$Xnlo) = @X;
my ($i)=shift;
$i-=16;
# The two annotations on the final xor of each sigma used to read
# "sigma0(X[i+1)&0x0f])" and "sigma0(X[i+14)&0x0f])"; the second one labels
# the sigma1 term, and both had unbalanced brackets.
$code.=<<___;
ldw `-$XOFF+8*(($i+1)%16)`(%sp),$Xnhi
ldw `-$XOFF+8*(($i+1)%16)+4`(%sp),$Xnlo ; load X[i+1]
ldw `-$XOFF+8*(($i+9)%16)`(%sp),$a1
ldw `-$XOFF+8*(($i+9)%16)+4`(%sp),$a0 ; load X[i+9]
ldw `-$XOFF+8*(($i+14)%16)`(%sp),$a3
ldw `-$XOFF+8*(($i+14)%16)+4`(%sp),$a2 ; load X[i+14]
shd $Xnhi,$Xnlo,$sigma0[0],$t0
shd $Xnlo,$Xnhi,$sigma0[0],$t1
add $a0,$Xlo,$Xlo
shd $Xnhi,$Xnlo,$sigma0[1],$t2
addc $a1,$Xhi,$Xhi
shd $Xnlo,$Xnhi,$sigma0[1],$t3
xor $t2,$t0,$t0
shd $Xnhi,$Xnlo,$sigma0[2],$t2
xor $t3,$t1,$t1
extru $Xnhi,`31-$sigma0[2]`,`32-$sigma0[2]`,$t3
xor $t2,$t0,$t0
shd $a3,$a2,$sigma1[0],$a0
xor $t3,$t1,$t1 ; sigma0(X[(i+1)&0x0f])
shd $a2,$a3,$sigma1[0],$a1
add $t0,$Xlo,$Xlo
shd $a3,$a2,$sigma1[1],$t2
addc $t1,$Xhi,$Xhi
shd $a2,$a3,$sigma1[1],$t3
xor $t2,$a0,$a0
shd $a3,$a2,$sigma1[2],$t2
xor $t3,$a1,$a1
extru $a3,`31-$sigma1[2]`,`32-$sigma1[2]`,$t3
xor $t2,$a0,$a0
xor $t3,$a1,$a1 ; sigma1(X[(i+14)&0x0f])
add $a0,$Xlo,$Xlo
addc $a1,$Xhi,$Xhi
stw $Xhi,`-$XOFF+8*($i%16)`(%sp)
stw $Xlo,`-$XOFF+8*($i%16)+4`(%sp)
___
&ROUND_00_15_pa1($i,@_,1);
}
# 32-bit code path: load the sixteen context words as hi/lo pairs, set
# %sar from the low two bits of $inp, and start the per-block loop
# L$oop_pa1, which word-aligns $inp and then copies the 128-byte input block
# to the X[] area at -$XOFF(%sp).  This first part is the misaligned copy;
# each stored word is the vshd merge of two neighbouring input words.
$code.=<<___;
ldw `0*4`($ctx),$Ahi ; load context
ldw `1*4`($ctx),$Alo
ldw `2*4`($ctx),$Bhi
ldw `3*4`($ctx),$Blo
ldw `4*4`($ctx),$Chi
ldw `5*4`($ctx),$Clo
ldw `6*4`($ctx),$Dhi
ldw `7*4`($ctx),$Dlo
ldw `8*4`($ctx),$Ehi
ldw `9*4`($ctx),$Elo
ldw `10*4`($ctx),$Fhi
ldw `11*4`($ctx),$Flo
ldw `12*4`($ctx),$Ghi
ldw `13*4`($ctx),$Glo
ldw `14*4`($ctx),$Hhi
ldw `15*4`($ctx),$Hlo
extru $inp,31,2,$t0
sh3addl $t0,%r0,$t0
subi 32,$t0,$t0
mtctl $t0,%cr11 ; load %sar with align factor
L\$oop_pa1
extru $inp,31,2,$a3
comib,= 0,$a3,L\$aligned_pa1
sub $inp,$a3,$inp
ldw `0*4`($inp),$X[0]
ldw `1*4`($inp),$X[1]
ldw `2*4`($inp),$t2
ldw `3*4`($inp),$t3
ldw `4*4`($inp),$a0
ldw `5*4`($inp),$a1
ldw `6*4`($inp),$a2
ldw `7*4`($inp),$a3
vshd $X[0],$X[1],$X[0]
vshd $X[1],$t2,$X[1]
stw $X[0],`-$XOFF+0*4`(%sp)
ldw `8*4`($inp),$t0
vshd $t2,$t3,$t2
stw $X[1],`-$XOFF+1*4`(%sp)
ldw `9*4`($inp),$t1
vshd $t3,$a0,$t3
___
# Software-pipelined copy through a rotating window of eight registers:
# store the oldest word, reuse its register for a load eight words ahead,
# and merge the next pair.  The second loop drains the window once no more
# loads are needed, and the last store sits in the delay slot of the branch
# to L$collected_pa1.
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<=(128/4-8);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
ldw `(8+$i)*4`($inp),$t[0]
vshd $t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
for (;$i<(128/4-1);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
vshd $t[1],$t[2],$t[1]
___
push(@t,shift(@t));
}
$code.=<<___;
b L\$collected_pa1
stw $t[0],`-$XOFF+$i*4`(%sp)
___
}
# Aligned copy: the same register window without the vshd merges.
$code.=<<___;
L\$aligned_pa1
ldw `0*4`($inp),$X[0]
ldw `1*4`($inp),$X[1]
ldw `2*4`($inp),$t2
ldw `3*4`($inp),$t3
ldw `4*4`($inp),$a0
ldw `5*4`($inp),$a1
ldw `6*4`($inp),$a2
ldw `7*4`($inp),$a3
stw $X[0],`-$XOFF+0*4`(%sp)
ldw `8*4`($inp),$t0
stw $X[1],`-$XOFF+1*4`(%sp)
ldw `9*4`($inp),$t1
___
{
my @t=($t2,$t3,$a0,$a1,$a2,$a3,$t0,$t1);
for ($i=2;$i<(128/4-8);$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
ldw `(8+$i)*4`($inp),$t[0]
___
push(@t,shift(@t));
}
for (;$i<128/4;$i++) {
$code.=<<___;
stw $t[0],`-$XOFF+$i*4`(%sp)
___
push(@t,shift(@t));
}
$code.="L\$collected_pa1\n";
}
# Rounds for the 32-bit code path.  @V is rotated by two entries (one hi/lo
# pair) after every round.  The sixteen rounds emitted after L$rounds_pa1
# are re-executed by the generated code until ROUND_00_15_pa1's check at
# i==15 sees the last table constant.
for($i=0;$i<16;$i++) { &ROUND_00_15_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
$code.="L\$rounds_pa1\n";
for(;$i<32;$i++) { &ROUND_16_xx_pa1($i,@V); unshift(@V,pop(@V)); unshift(@V,pop(@V)); }
# Restore the saved arguments, rewind $Tbl, add the previous context to the
# working variables with add/addc pairs, store the new context, advance
# $inp, and either leave through L$done or save $inp and start the next
# block.
$code.=<<___;
$POP `-$FRAME_MARKER-2*$SIZE_T`(%sp),$ctx ; restore arguments
$POP `-$FRAME_MARKER-3*$SIZE_T`(%sp),$inp
$POP `-$FRAME_MARKER-4*$SIZE_T`(%sp),$num
ldo `-$rounds*$SZ`($Tbl),$Tbl ; rewind $Tbl
ldw `0*4`($ctx),$t1 ; update context
ldw `1*4`($ctx),$t0
ldw `2*4`($ctx),$t3
ldw `3*4`($ctx),$t2
ldw `4*4`($ctx),$a1
ldw `5*4`($ctx),$a0
ldw `6*4`($ctx),$a3
add $t0,$Alo,$Alo
ldw `7*4`($ctx),$a2
addc $t1,$Ahi,$Ahi
ldw `8*4`($ctx),$t1
add $t2,$Blo,$Blo
ldw `9*4`($ctx),$t0
addc $t3,$Bhi,$Bhi
ldw `10*4`($ctx),$t3
add $a0,$Clo,$Clo
ldw `11*4`($ctx),$t2
addc $a1,$Chi,$Chi
ldw `12*4`($ctx),$a1
add $a2,$Dlo,$Dlo
ldw `13*4`($ctx),$a0
addc $a3,$Dhi,$Dhi
ldw `14*4`($ctx),$a3
add $t0,$Elo,$Elo
ldw `15*4`($ctx),$a2
addc $t1,$Ehi,$Ehi
stw $Ahi,`0*4`($ctx)
add $t2,$Flo,$Flo
stw $Alo,`1*4`($ctx)
addc $t3,$Fhi,$Fhi
stw $Bhi,`2*4`($ctx)
add $a0,$Glo,$Glo
stw $Blo,`3*4`($ctx)
addc $a1,$Ghi,$Ghi
stw $Chi,`4*4`($ctx)
add $a2,$Hlo,$Hlo
stw $Clo,`5*4`($ctx)
addc $a3,$Hhi,$Hhi
stw $Dhi,`6*4`($ctx)
ldo `16*$SZ`($inp),$inp ; advance $inp
stw $Dlo,`7*4`($ctx)
stw $Ehi,`8*4`($ctx)
stw $Elo,`9*4`($ctx)
stw $Fhi,`10*4`($ctx)
stw $Flo,`11*4`($ctx)
stw $Ghi,`12*4`($ctx)
stw $Glo,`13*4`($ctx)
stw $Hhi,`14*4`($ctx)
comb,= $inp,$num,L\$done
stw $Hlo,`15*4`($ctx)
b L\$oop_pa1
$PUSH $inp,`-$FRAME_MARKER-3*$SIZE_T`(%sp) ; save $inp
L\$done
___
}}
# Common function exit: restore %r2 and %r4..%r18 from the frame, return,
# and pop the frame while restoring %r3 in the branch delay slot.  The
# identification string follows the procedure.
$code.=<<___;
$POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue
$POP `-$FRAME+1*$SIZE_T`(%sp),%r4
$POP `-$FRAME+2*$SIZE_T`(%sp),%r5
$POP `-$FRAME+3*$SIZE_T`(%sp),%r6
$POP `-$FRAME+4*$SIZE_T`(%sp),%r7
$POP `-$FRAME+5*$SIZE_T`(%sp),%r8
$POP `-$FRAME+6*$SIZE_T`(%sp),%r9
$POP `-$FRAME+7*$SIZE_T`(%sp),%r10
$POP `-$FRAME+8*$SIZE_T`(%sp),%r11
$POP `-$FRAME+9*$SIZE_T`(%sp),%r12
$POP `-$FRAME+10*$SIZE_T`(%sp),%r13
$POP `-$FRAME+11*$SIZE_T`(%sp),%r14
$POP `-$FRAME+12*$SIZE_T`(%sp),%r15
$POP `-$FRAME+13*$SIZE_T`(%sp),%r16
$POP `-$FRAME+14*$SIZE_T`(%sp),%r17
$POP `-$FRAME+15*$SIZE_T`(%sp),%r18
bv (%r2)
.EXIT
$POPMB -$FRAME(%sp),%r3
.PROCEND
.STRINGZ "SHA`64*$SZ` block transform for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___
# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...
# Encode an "ldd" instruction as a raw .WORD so that the module can be
# assembled under .LEVEL 1.0.
#   $mod  - completer text following the mnemonic ("" or e.g. ",ma", ",mb")
#   $args - operand text, expected in the form "disp(%rB),%rT"
# Returns the replacement source line: a .WORD directive carrying the
# original instruction as a comment, or the instruction text unchanged when
# the operands are not in displacement form.
my $ldd = sub {
    my ($mod,$args) = @_;
    my $orig = "ldd$mod\t$args";

    # The pattern is anchored at the start of the operand so that an
    # index-register operand such as "%r1(%r2),%r3" is passed through
    # untouched instead of being encoded as displacement 1 off %r2.
    if ($args =~ /^(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 3 suffices
    {
        my ($disp,$base,$target) = ($1,$2,$3);
        my $opcode = (0x14<<26)|($base<<21)|($target<<16)
                    |(($disp&0x1FF8)<<1)|(($disp>>13)&1);
        $opcode |= (1<<3) if ($mod =~ /^,m/);	# any modifying form
        $opcode |= (1<<2) if ($mod =~ /^,mb/);	# modify-before variant
        return sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    return "\t".$orig;
};
# Encode a "std" instruction as a raw .WORD.
#   $mod  - completer text following the mnemonic; it is kept in the
#           comment but does not influence the encoding
#   $args - operand text, expected in the form "%rS,disp(%rB)"
# Returns the replacement source line, or the instruction text unchanged
# when the operands do not match that form.
my $std = sub {
    my ($mod,$args) = @_;
    my $orig = "std$mod\t$args";

    return "\t".$orig
        unless ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/);	# format 3 suffices

    my ($source,$disp,$base) = ($1,$2,$3);
    my $word = (0x1c<<26)|($base<<21)|($source<<16);
    $word |= ($disp&0x1FF8)<<1;
    $word |= ($disp>>13)&1;
    return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
};
# Encode an "extrd" instruction as a raw .WORD.
#   $mod  - completer text following the mnemonic; only a trailing "="
#           condition (optionally written "*=") affects the encoding
#   $args - operand text, either "%rS,pos,len,%rT" (fixed position) or
#           "%rS,%sar,len,%rT" (position taken from %sar)
# Returns the replacement source line, or the instruction text unchanged
# when the operands match neither form.
my $extrd = sub {
    my ($mod,$args) = @_;
    my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {
        my ($source,$pos,$width,$target) = ($1,$2,$3,$4);
        my $len = 32-$width;
        my $word = (0x36<<26)|($source<<21)|($target<<16);
        $word |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);	# encode pos
        $word |= (($len&0x20)<<7)|($len&0x1f);		# encode len
        return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
    }
    if ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {
        my ($source,$width,$target) = ($1,$2,$3);
        my $len = 32-$width;
        my $word = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
        $word |= (($len&0x20)<<3)|($len&0x1f);		# encode len
        $word |= (1<<13) if ($mod =~ /,\**=/);
        return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
    }
    return "\t".$orig;
};
# Encode a "shrpd" instruction as a raw .WORD.
#   $mod  - completer text following the mnemonic; kept in the comment only
#   $args - operand text, either "%rA,%rB,count,%rT" (fixed shift count) or
#           "%rA,%rB,%sar,%rT" (shift count taken from %sar)
# Returns the replacement source line, or the instruction text unchanged
# when the operands match neither form.
my $shrpd = sub {
    my ($mod,$args) = @_;
    my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {
        my ($first,$second,$count,$target) = ($1,$2,$3,$4);
        my $word = (0x34<<26)|($second<<21)|($first<<16)|(1<<10)|$target;
        my $cpos = 63-$count;
        $word |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);	# encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
    }
    if ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/)	# format 11
    {
        my ($first,$second,$target) = ($1,$2,$3);
        my $word = (0x34<<26)|($second<<21)|($first<<16)|(1<<9)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s",$word,$orig;
    }
    return "\t".$orig;
};
# Translate one instruction for the 32-bit build.
#   $mnemonic - instruction name (letters only, as matched by the caller)
#   $mod      - completer text directly following the mnemonic
#   $args     - operand text
# If a variable named after the mnemonic holds a code reference (the
# encoders defined above: $ldd, $std, $extrd, $shrpd), that encoder builds
# the output line; every other instruction is re-emitted as text.
# NOTE(review): the lookup is a string eval on the mnemonic; the caller's
# pattern restricts it to [a-z]+, which is what keeps this safe.
sub assemble {
    my ($mnemonic,$mod,$args)=@_;
    my $encoder = eval("\$$mnemonic");

    return $encoder->($mod,$args) if (ref($encoder) eq 'CODE');
    return "\t$mnemonic$mod\t$args";
}
# Final pass: process the generated text line by line and print it.
#  1. Evaluate back-ticked Perl fragments.
#  2. Apply at most one of the following rewrites (they are chained with
#     "or"): shd with a shift count above 31 gets its two source registers
#     swapped and the count reduced by 32; the made-up _ror, _shr and
#     _align mnemonics are mapped to 32- or 64-bit instructions depending
#     on $SZ, and _shl depending on $SIZE_T.
#  3. For $SIZE_T==4, hand indented instructions to assemble() so that the
#     PA-RISC 2.0 ones are emitted as .WORD, and rewrite "cmpb,*" as
#     "comb,".
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/ge;
s/shd\s+(%r[0-9]+),(%r[0-9]+),([0-9]+)/
$3>31 ? sprintf("shd\t%$2,%$1,%d",$3-32) # rotation for >=32
: sprintf("shd\t%$1,%$2,%d",$3)/e or
# translate made up instructions: _ror, _shr, _align, _shl
s/_ror(\s+)(%r[0-9]+),/
($SZ==4 ? "shd" : "shrpd")."$1$2,$2,"/e or
s/_shr(\s+%r[0-9]+),([0-9]+),/
$SZ==4 ? sprintf("extru%s,%d,%d,",$1,31-$2,32-$2)
: sprintf("extrd,u%s,%d,%d,",$1,63-$2,64-$2)/e or
s/_align(\s+%r[0-9]+,%r[0-9]+),/
($SZ==4 ? "vshd$1," : "shrpd$1,%sar,")/e or
s/_shl(\s+%r[0-9]+),([0-9]+),/
$SIZE_T==4 ? sprintf("zdep%s,%d,%d,",$1,31-$2,32-$2)
: sprintf("depd,z%s,%d,%d,",$1,63-$2,64-$2)/e;
s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($SIZE_T==4);
s/cmpb,\*/comb,/ if ($SIZE_T==4);
print $_,"\n";
}
# Write errors on a buffered handle are only reported when it is closed,
# so a failed close must abort the build rather than leave a truncated file.
close STDOUT or die "error closing STDOUT: $!";