s390x assembler pack: tune-up and support for new z196 hardware.
This commit is contained in:
parent
8aa6cff40f
commit
0ab8fd58e1
@ -70,6 +70,18 @@
|
||||
# remains z/Architecture specific. On z990 it was measured to perform
|
||||
# 2x better than code generated by gcc 4.3.
|
||||
|
||||
# December 2010.
|
||||
#
|
||||
# Add support for z196 "cipher message with counter" instruction.
|
||||
# Note however that it's disengaged, because it was measured to
|
||||
# perform ~12% worse than vanilla km-based code...
|
||||
|
||||
# February 2011.
|
||||
#
|
||||
# Add AES_xts_[en|de]crypt. This includes support for z196
|
||||
# km-xts-aes instructions, which deliver ~70% improvement at 8KB
|
||||
# block size over vanilla km-based code.
|
||||
|
||||
$flavour = shift;
|
||||
|
||||
if ($flavour =~ /3[12]/) {
|
||||
@ -268,7 +280,7 @@ $code.=<<___;
|
||||
.type _s390x_AES_encrypt,\@function
|
||||
.align 16
|
||||
_s390x_AES_encrypt:
|
||||
st${g} $ra,`$stdframe-$SIZE_T`($sp)
|
||||
st${g} $ra,15*$SIZE_T($sp)
|
||||
x $s0,0($key)
|
||||
x $s1,4($key)
|
||||
x $s2,8($key)
|
||||
@ -432,7 +444,7 @@ _s390x_AES_encrypt:
|
||||
or $s2,$i3
|
||||
or $s3,$t3
|
||||
|
||||
l${g} $ra,`$stdframe-$SIZE_T`($sp)
|
||||
l${g} $ra,15*$SIZE_T($sp)
|
||||
xr $s0,$t0
|
||||
xr $s1,$t2
|
||||
x $s2,24($key)
|
||||
@ -594,7 +606,7 @@ $code.=<<___;
|
||||
.type _s390x_AES_decrypt,\@function
|
||||
.align 16
|
||||
_s390x_AES_decrypt:
|
||||
st${g} $ra,`$stdframe-$SIZE_T`($sp)
|
||||
st${g} $ra,15*$SIZE_T($sp)
|
||||
x $s0,0($key)
|
||||
x $s1,4($key)
|
||||
x $s2,8($key)
|
||||
@ -738,7 +750,7 @@ _s390x_AES_decrypt:
|
||||
nr $i1,$mask
|
||||
nr $i2,$mask
|
||||
|
||||
l${g} $ra,`$stdframe-$SIZE_T`($sp)
|
||||
l${g} $ra,15*$SIZE_T($sp)
|
||||
or $s1,$t1
|
||||
l $t0,16($key)
|
||||
l $t1,20($key)
|
||||
@ -1164,7 +1176,8 @@ $code.=<<___;
|
||||
.size AES_set_decrypt_key,.-AES_set_decrypt_key
|
||||
___
|
||||
|
||||
#void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
|
||||
########################################################################
|
||||
# void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
|
||||
# size_t length, const AES_KEY *key,
|
||||
# unsigned char *ivec, const int enc)
|
||||
{
|
||||
@ -1365,13 +1378,14 @@ $code.=<<___;
|
||||
.size AES_cbc_encrypt,.-AES_cbc_encrypt
|
||||
___
|
||||
}
|
||||
#void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
|
||||
########################################################################
|
||||
# void AES_ctr32_encrypt(const unsigned char *in, unsigned char *out,
|
||||
# size_t blocks, const AES_KEY *key,
|
||||
# const unsigned char *ivec)
|
||||
{
|
||||
my $inp="%r2";
|
||||
my $out="%r3";
|
||||
my $len="%r4";
|
||||
my $out="%r4"; # blocks and out are swapped
|
||||
my $len="%r3";
|
||||
my $key="%r5"; my $iv0="%r5";
|
||||
my $ivp="%r6";
|
||||
my $fp ="%r7";
|
||||
@ -1381,6 +1395,9 @@ $code.=<<___;
|
||||
.type AES_ctr32_encrypt,\@function
|
||||
.align 16
|
||||
AES_ctr32_encrypt:
|
||||
xgr %r3,%r4 # flip %r3 and %r4, $out and $len
|
||||
xgr %r4,%r3
|
||||
xgr %r3,%r4
|
||||
llgfr $len,$len # safe in ctr32 subroutine even in 64-bit case
|
||||
___
|
||||
$code.=<<___ if (!$softonly);
|
||||
@ -1415,20 +1432,75 @@ $code.=<<___ if (!$softonly);
|
||||
st${g} $fp,$SIZE_T($sp)
|
||||
|
||||
slgr $len,$fp
|
||||
brc 1,.Lctr32_hw_loop # not zero, no borrow
|
||||
brc 1,.Lctr32_hw_switch # not zero, no borrow
|
||||
algr $fp,$len # input is shorter than allocated buffer
|
||||
lghi $len,0
|
||||
st${g} $fp,$SIZE_T($sp)
|
||||
|
||||
.Lctr32_hw_loop:
|
||||
.Lctr32_hw_switch:
|
||||
___
|
||||
$code.=<<___ if (0); ######### kmctr code was measured to be ~12% slower
|
||||
larl $s0,OPENSSL_s390xcap_P
|
||||
lg $s0,8($s0)
|
||||
tmhh $s0,0x0004 # check for message_security-assist-4
|
||||
jz .Lctr32_km_loop
|
||||
|
||||
llgfr $s0,%r0
|
||||
lgr $s1,%r1
|
||||
lghi %r0,0
|
||||
la %r1,16($sp)
|
||||
.long 0xb92d2042 # kmctr %r4,%r2,%r2
|
||||
|
||||
llihh %r0,0x8000 # check if kmctr supports the function code
|
||||
srlg %r0,%r0,0($s0)
|
||||
ng %r0,16($sp)
|
||||
lgr %r0,$s0
|
||||
lgr %r1,$s1
|
||||
jz .Lctr32_km_loop
|
||||
|
||||
####### kmctr code
|
||||
algr $out,$inp # restore $out
|
||||
lgr $s1,$len # $s1 undertakes $len
|
||||
j .Lctr32_kmctr_loop
|
||||
.align 16
|
||||
.Lctr32_kmctr_loop:
|
||||
la $s2,16($sp)
|
||||
lgr $s3,$fp
|
||||
.Lctr32_hw_prepare:
|
||||
.Lctr32_kmctr_prepare:
|
||||
stg $iv0,0($s2)
|
||||
stg $ivp,8($s2)
|
||||
la $s2,16($s2)
|
||||
ahi $ivp,1 # 32-bit increment, preserves upper half
|
||||
brct $s3,.Lctr32_hw_prepare
|
||||
brct $s3,.Lctr32_kmctr_prepare
|
||||
|
||||
#la $inp,0($inp) # inp
|
||||
sllg $len,$fp,4 # len
|
||||
#la $out,0($out) # out
|
||||
la $s2,16($sp) # iv
|
||||
.long 0xb92da042 # kmctr $out,$s2,$inp
|
||||
brc 1,.-4 # pay attention to "partial completion"
|
||||
|
||||
slgr $s1,$fp
|
||||
brc 1,.Lctr32_kmctr_loop # not zero, no borrow
|
||||
algr $fp,$s1
|
||||
lghi $s1,0
|
||||
brc 4+1,.Lctr32_kmctr_loop # not zero
|
||||
|
||||
l${g} $sp,0($sp)
|
||||
lm${g} %r6,$s3,6*$SIZE_T($sp)
|
||||
br $ra
|
||||
.align 16
|
||||
___
|
||||
$code.=<<___;
|
||||
.Lctr32_km_loop:
|
||||
la $s2,16($sp)
|
||||
lgr $s3,$fp
|
||||
.Lctr32_km_prepare:
|
||||
stg $iv0,0($s2)
|
||||
stg $ivp,8($s2)
|
||||
la $s2,16($s2)
|
||||
ahi $ivp,1 # 32-bit increment, preserves upper half
|
||||
brct $s3,.Lctr32_km_prepare
|
||||
|
||||
la $s0,16($sp) # inp
|
||||
sllg $s1,$fp,4 # len
|
||||
@ -1439,7 +1511,7 @@ $code.=<<___ if (!$softonly);
|
||||
la $s2,16($sp)
|
||||
lgr $s3,$fp
|
||||
slgr $s2,$inp
|
||||
.Lctr32_hw_xor:
|
||||
.Lctr32_km_xor:
|
||||
lg $s0,0($inp)
|
||||
lg $s1,8($inp)
|
||||
xg $s0,0($s2,$inp)
|
||||
@ -1447,22 +1519,22 @@ $code.=<<___ if (!$softonly);
|
||||
stg $s0,0($out,$inp)
|
||||
stg $s1,8($out,$inp)
|
||||
la $inp,16($inp)
|
||||
brct $s3,.Lctr32_hw_xor
|
||||
brct $s3,.Lctr32_km_xor
|
||||
|
||||
slgr $len,$fp
|
||||
brc 1,.Lctr32_hw_loop # not zero, no borrow
|
||||
brc 1,.Lctr32_km_loop # not zero, no borrow
|
||||
algr $fp,$len
|
||||
lghi $len,0
|
||||
brc 4+1,.Lctr32_hw_loop # not zero
|
||||
brc 4+1,.Lctr32_km_loop # not zero
|
||||
|
||||
l${g} $s0,0($sp)
|
||||
l${g} $s1,$SIZE_T($sp)
|
||||
la $s2,16($sp)
|
||||
.Lctr32_hw_zap:
|
||||
.Lctr32_km_zap:
|
||||
stg $s0,0($s2)
|
||||
stg $s0,8($s2)
|
||||
la $s2,16($s2)
|
||||
brct $s1,.Lctr32_hw_zap
|
||||
brct $s1,.Lctr32_km_zap
|
||||
|
||||
la $sp,0($s0)
|
||||
lm${g} %r6,$s3,6*$SIZE_T($sp)
|
||||
@ -1472,12 +1544,12 @@ $code.=<<___ if (!$softonly);
|
||||
___
|
||||
$code.=<<___;
|
||||
stm${g} $key,$ra,5*$SIZE_T($sp)
|
||||
sl${g}r $out,$inp
|
||||
sl${g}r $inp,$out
|
||||
larl $tbl,AES_Te
|
||||
llgf $t1,12($ivp)
|
||||
|
||||
.Lctr32_loop:
|
||||
stm${g} $inp,$len,2*$SIZE_T($sp)
|
||||
stm${g} $inp,$out,2*$SIZE_T($sp)
|
||||
llgf $s0,0($ivp)
|
||||
llgf $s1,4($ivp)
|
||||
llgf $s2,8($ivp)
|
||||
@ -1489,16 +1561,13 @@ $code.=<<___;
|
||||
|
||||
lm${g} $inp,$ivp,2*$SIZE_T($sp)
|
||||
llgf $t1,16*$SIZE_T($sp)
|
||||
x $s0,0($inp)
|
||||
x $s1,4($inp)
|
||||
x $s2,8($inp)
|
||||
x $s3,12($inp)
|
||||
st $s0,0($out,$inp)
|
||||
st $s1,4($out,$inp)
|
||||
st $s2,8($out,$inp)
|
||||
st $s3,12($out,$inp)
|
||||
x $s0,0($inp,$out)
|
||||
x $s1,4($inp,$out)
|
||||
x $s2,8($inp,$out)
|
||||
x $s3,12($inp,$out)
|
||||
stm $s0,$s3,0($out)
|
||||
|
||||
la $inp,16($inp)
|
||||
la $out,16($out)
|
||||
ahi $t1,1 # 32-bit increment
|
||||
brct $len,.Lctr32_loop
|
||||
|
||||
@ -1507,9 +1576,679 @@ $code.=<<___;
|
||||
.size AES_ctr32_encrypt,.-AES_ctr32_encrypt
|
||||
___
|
||||
}
|
||||
|
||||
########################################################################
|
||||
# void AES_xts_encrypt(const char *inp,char *out,size_t len,
|
||||
# const AES_KEY *key1, const AES_KEY *key2,u64 secno);
|
||||
#
|
||||
{
|
||||
my $inp="%r2";
|
||||
my $out="%r4"; # len and out are swapped
|
||||
my $len="%r3";
|
||||
my $key1="%r5"; # $i1
|
||||
my $key2="%r6"; # $i2
|
||||
my $fp="%r7"; # $i3
|
||||
my $tweak=16*$SIZE_T+16; # or $stdframe-16, bottom of the frame...
|
||||
|
||||
$code.=<<___;
|
||||
.type _s390x_xts_km,\@function
|
||||
.align 16
|
||||
_s390x_xts_km:
|
||||
___
|
||||
$code.=<<___ if(0);
|
||||
llgfr $s0,%r0 # put aside the function code
|
||||
lghi $s1,0x7f
|
||||
nr $s1,%r0
|
||||
lghi %r0,0 # query capability vector
|
||||
la %r1,2*$SIZE_T($sp)
|
||||
.long 0xb92e0042 # km %r4,%r2
|
||||
llihh %r1,0x8000
|
||||
srlg %r1,%r1,32($s1) # check for 32+function code
|
||||
ng %r1,2*$SIZE_T($sp)
|
||||
lgr %r0,$s0 # restore the function code
|
||||
la %r1,0($key1) # restore $key1
|
||||
jz .Lxts_km_vanilla
|
||||
|
||||
lmg $i2,$i3,$tweak($sp) # put aside the tweak value
|
||||
algr $out,$inp
|
||||
|
||||
oill %r0,32 # switch to xts function code
|
||||
aghi $s1,-18 #
|
||||
sllg $s1,$s1,3 # (function code - 18)*8, 0 or 16
|
||||
la %r1,$tweak-16($sp)
|
||||
slgr %r1,$s1 # parameter block position
|
||||
lmg $s0,$s3,0($key1) # load 256 bits of key material,
|
||||
stmg $s0,$s3,0(%r1) # and copy it to parameter block.
|
||||
# yes, it contains junk and overlaps
|
||||
# with the tweak in 128-bit case.
|
||||
# it's done to avoid conditional
|
||||
# branch.
|
||||
stmg $i2,$i3,$tweak($sp) # "re-seat" the tweak value
|
||||
|
||||
.long 0xb92e0042 # km %r4,%r2
|
||||
brc 1,.-4 # pay attention to "partial completion"
|
||||
|
||||
lrvg $s0,$tweak+0($sp) # load the last tweak
|
||||
lrvg $s1,$tweak+8($sp)
|
||||
stmg %r0,%r3,$tweak-32(%r1) # wipe copy of the key
|
||||
|
||||
nill %r0,0xffdf # switch back to original function code
|
||||
la %r1,0($key1) # restore pointer to $key1
|
||||
slgr $out,$inp
|
||||
|
||||
llgc $len,2*$SIZE_T-1($sp)
|
||||
nill $len,0x0f # $len%=16
|
||||
br $ra
|
||||
|
||||
.align 16
|
||||
.Lxts_km_vanilla:
|
||||
___
|
||||
$code.=<<___;
|
||||
# prepare and allocate stack frame at the top of 4K page
|
||||
# with 1K reserved for eventual signal handling
|
||||
lghi $s0,-1024-256-16# guarantee at least 256-bytes buffer
|
||||
lghi $s1,-4096
|
||||
algr $s0,$sp
|
||||
lgr $fp,$sp
|
||||
ngr $s0,$s1 # align at page boundary
|
||||
slgr $fp,$s0 # total buffer size
|
||||
lgr $s2,$sp
|
||||
lghi $s1,1024+16 # sl[g]fi is extended-immediate facility
|
||||
slgr $fp,$s1 # deduct reservation to get usable buffer size
|
||||
# buffer size is at least 256 and at most 3072+256-16
|
||||
|
||||
la $sp,1024($s0) # alloca
|
||||
nill $fp,0xfff0 # round to 16*n
|
||||
st${g} $s2,0($sp) # back-chain
|
||||
nill $len,0xfff0 # redundant
|
||||
st${g} $fp,$SIZE_T($sp)
|
||||
|
||||
slgr $len,$fp
|
||||
brc 1,.Lxts_km_go # not zero, no borrow
|
||||
algr $fp,$len # input is shorter than allocated buffer
|
||||
lghi $len,0
|
||||
st${g} $fp,$SIZE_T($sp)
|
||||
|
||||
.Lxts_km_go:
|
||||
lrvg $s0,$tweak+0($s2) # load the tweak value in little-endian
|
||||
lrvg $s1,$tweak+8($s2)
|
||||
|
||||
la $s2,16($sp) # vector of ascending tweak values
|
||||
slgr $s2,$inp
|
||||
srlg $s3,$fp,4
|
||||
j .Lxts_km_start
|
||||
|
||||
.Lxts_km_loop:
|
||||
la $s2,16($sp)
|
||||
slgr $s2,$inp
|
||||
srlg $s3,$fp,4
|
||||
.Lxts_km_prepare:
|
||||
lghi $i1,0x87
|
||||
srag $i2,$s1,63 # broadcast upper bit
|
||||
ngr $i1,$i2 # rem
|
||||
srlg $i2,$s0,63 # carry bit from lower half
|
||||
sllg $s0,$s0,1
|
||||
sllg $s1,$s1,1
|
||||
xgr $s0,$i1
|
||||
ogr $s1,$i2
|
||||
.Lxts_km_start:
|
||||
lrvgr $i1,$s0 # flip byte order
|
||||
lrvgr $i2,$s1
|
||||
stg $i1,0($s2,$inp)
|
||||
stg $i2,8($s2,$inp)
|
||||
xg $i1,0($inp)
|
||||
xg $i2,8($inp)
|
||||
stg $i1,0($out,$inp)
|
||||
stg $i2,8($out,$inp)
|
||||
la $inp,16($inp)
|
||||
brct $s3,.Lxts_km_prepare
|
||||
|
||||
slgr $inp,$fp # rewind $inp
|
||||
la $s2,0($out,$inp)
|
||||
lgr $s3,$fp
|
||||
.long 0xb92e00aa # km $s2,$s2
|
||||
brc 1,.-4 # pay attention to "partial completion"
|
||||
|
||||
la $s2,16($sp)
|
||||
slgr $s2,$inp
|
||||
srlg $s3,$fp,4
|
||||
.Lxts_km_xor:
|
||||
lg $i1,0($out,$inp)
|
||||
lg $i2,8($out,$inp)
|
||||
xg $i1,0($s2,$inp)
|
||||
xg $i2,8($s2,$inp)
|
||||
stg $i1,0($out,$inp)
|
||||
stg $i2,8($out,$inp)
|
||||
la $inp,16($inp)
|
||||
brct $s3,.Lxts_km_xor
|
||||
|
||||
slgr $len,$fp
|
||||
brc 1,.Lxts_km_loop # not zero, no borrow
|
||||
algr $fp,$len
|
||||
lghi $len,0
|
||||
brc 4+1,.Lxts_km_loop # not zero
|
||||
|
||||
l${g} $i1,0($sp) # back-chain
|
||||
llgf $fp,`2*$SIZE_T-4`($sp) # bytes used
|
||||
la $i2,16($sp)
|
||||
srlg $fp,$fp,4
|
||||
.Lxts_km_zap:
|
||||
stg $i1,0($i2)
|
||||
stg $i1,8($i2)
|
||||
la $i2,16($i2)
|
||||
brct $fp,.Lxts_km_zap
|
||||
|
||||
la $sp,0($i1)
|
||||
llgc $len,2*$SIZE_T-1($i1)
|
||||
nill $len,0x0f # $len%=16
|
||||
bzr $ra
|
||||
|
||||
# generate one more tweak...
|
||||
lghi $i1,0x87
|
||||
srag $i2,$s1,63 # broadcast upper bit
|
||||
ngr $i1,$i2 # rem
|
||||
srlg $i2,$s0,63 # carry bit from lower half
|
||||
sllg $s0,$s0,1
|
||||
sllg $s1,$s1,1
|
||||
xgr $s0,$i1
|
||||
ogr $s1,$i2
|
||||
|
||||
ltr $len,$len # clear zero flag
|
||||
br $ra
|
||||
.size _s390x_xts_km,.-_s390x_xts_km
|
||||
|
||||
.globl AES_xts_encrypt
|
||||
.type AES_xts_encrypt,\@function
|
||||
.align 16
|
||||
AES_xts_encrypt:
|
||||
xgr %r3,%r4 # flip %r3 and %r4, $out and $len
|
||||
xgr %r4,%r3
|
||||
xgr %r3,%r4
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
llgfr $len,$len
|
||||
___
|
||||
$code.=<<___;
|
||||
st${g} $len,1*$SIZE_T($sp) # save copy of $len
|
||||
srag $len,$len,4 # formally wrong, because it expands
|
||||
# sign bit, but who can afford asking
|
||||
# to process more than 2^63-1 bytes?
|
||||
# I use it, because it sets condition
|
||||
# code...
|
||||
bcr 8,$ra # abort if zero (i.e. less than 16)
|
||||
___
|
||||
$code.=<<___ if (!$softonly);
|
||||
llgf %r0,240($key2)
|
||||
lhi %r1,16
|
||||
clr %r0,%r1
|
||||
jl .Lxts_enc_software
|
||||
|
||||
stm${g} %r6,$s3,6*$SIZE_T($sp)
|
||||
st${g} $ra,14*$SIZE_T($sp)
|
||||
|
||||
sllg $len,$len,4 # $len&=~15
|
||||
slgr $out,$inp
|
||||
|
||||
lrvg $s0,$stdframe($sp) # load secno
|
||||
lghi $s1,0
|
||||
la $s2,$tweak($sp)
|
||||
lghi $s3,16
|
||||
stmg $s0,$s1,0($s2)
|
||||
la %r1,0($key2) # $key2 is not needed anymore
|
||||
.long 0xb92e00aa # km $s2,$s2, generate the tweak
|
||||
brc 1,.-4 # can this happen?
|
||||
|
||||
l %r0,240($key1)
|
||||
la %r1,0($key1) # $key1 is not needed anymore
|
||||
bras $ra,_s390x_xts_km
|
||||
jz .Lxts_enc_km_done
|
||||
|
||||
aghi $inp,-16 # take one step back
|
||||
la $i3,0($out,$inp) # put aside real $out
|
||||
.Lxts_enc_km_steal:
|
||||
llgc $i1,16($inp)
|
||||
llgc $i2,0($out,$inp)
|
||||
stc $i1,0($out,$inp)
|
||||
stc $i2,16($out,$inp)
|
||||
la $inp,1($inp)
|
||||
brct $len,.Lxts_enc_km_steal
|
||||
|
||||
la $s2,0($i3)
|
||||
lghi $s3,16
|
||||
lrvgr $i1,$s0 # flip byte order
|
||||
lrvgr $i2,$s1
|
||||
xg $i1,0($s2)
|
||||
xg $i2,8($s2)
|
||||
stg $i1,0($s2)
|
||||
stg $i2,8($s2)
|
||||
.long 0xb92e00aa # km $s2,$s2
|
||||
brc 1,.-4 # can this happen?
|
||||
lrvgr $i1,$s0 # flip byte order
|
||||
lrvgr $i2,$s1
|
||||
xg $i1,0($i3)
|
||||
xg $i2,8($i3)
|
||||
stg $i1,0($i3)
|
||||
stg $i2,8($i3)
|
||||
|
||||
.Lxts_enc_km_done: # exit of the hardware (km) encrypt path
|
||||
l${g} $ra,14*$SIZE_T($sp) # reload return address saved on entry
|
||||
stg $sp,$tweak+0($sp) # wipe tweak, first 8 of its 16 bytes
|
||||
stg $sp,$tweak+8($sp) # ... and the remaining 8 bytes
|
||||
lm${g} %r6,$s3,6*$SIZE_T($sp) # restore callee-saved registers
|
||||
br $ra
|
||||
.align 16
|
||||
.Lxts_enc_software:
|
||||
___
|
||||
$code.=<<___;
|
||||
stm${g} %r6,$ra,6*$SIZE_T($sp)
|
||||
|
||||
slgr $out,$inp
|
||||
|
||||
xgr $s0,$s0 # clear upper half
|
||||
xgr $s1,$s1
|
||||
lrv $s0,$stdframe+4($sp) # load secno
|
||||
lrv $s1,$stdframe+0($sp)
|
||||
xgr $s2,$s2
|
||||
xgr $s3,$s3
|
||||
stm${g} %r2,%r5,2*$SIZE_T($sp)
|
||||
la $key,0($key2)
|
||||
larl $tbl,AES_Te
|
||||
bras $ra,_s390x_AES_encrypt # generate the tweak
|
||||
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
||||
stm $s0,$s3,$tweak($sp) # save the tweak
|
||||
j .Lxts_enc_enter
|
||||
|
||||
.align 16
|
||||
.Lxts_enc_loop:
|
||||
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
||||
lrvg $s3,$tweak+8($sp)
|
||||
lghi %r1,0x87
|
||||
srag %r0,$s3,63 # broadcast upper bit
|
||||
ngr %r1,%r0 # rem
|
||||
srlg %r0,$s1,63 # carry bit from lower half
|
||||
sllg $s1,$s1,1
|
||||
sllg $s3,$s3,1
|
||||
xgr $s1,%r1
|
||||
ogr $s3,%r0
|
||||
lrvgr $s1,$s1 # flip byte order
|
||||
lrvgr $s3,$s3
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
stg $s1,$tweak+0($sp) # save the tweak
|
||||
llgfr $s1,$s1
|
||||
srlg $s2,$s3,32
|
||||
stg $s3,$tweak+8($sp)
|
||||
llgfr $s3,$s3
|
||||
la $inp,16($inp) # $inp+=16
|
||||
.Lxts_enc_enter:
|
||||
x $s0,0($inp) # ^=*($inp)
|
||||
x $s1,4($inp)
|
||||
x $s2,8($inp)
|
||||
x $s3,12($inp)
|
||||
stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
|
||||
la $key,0($key1)
|
||||
bras $ra,_s390x_AES_encrypt
|
||||
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
||||
x $s0,$tweak+0($sp) # ^=tweak
|
||||
x $s1,$tweak+4($sp)
|
||||
x $s2,$tweak+8($sp)
|
||||
x $s3,$tweak+12($sp)
|
||||
st $s0,0($out,$inp)
|
||||
st $s1,4($out,$inp)
|
||||
st $s2,8($out,$inp)
|
||||
st $s3,12($out,$inp)
|
||||
brct${g} $len,.Lxts_enc_loop
|
||||
|
||||
llgc $len,`2*$SIZE_T-1`($sp)
|
||||
nill $len,0x0f # $len%16
|
||||
jz .Lxts_enc_done
|
||||
|
||||
la $i3,0($inp,$out) # put aside real $out
|
||||
.Lxts_enc_steal:
|
||||
llgc %r0,16($inp)
|
||||
llgc %r1,0($out,$inp)
|
||||
stc %r0,0($out,$inp)
|
||||
stc %r1,16($out,$inp)
|
||||
la $inp,1($inp)
|
||||
brct $len,.Lxts_enc_steal
|
||||
la $out,0($i3) # restore real $out
|
||||
|
||||
# generate last tweak...
|
||||
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
||||
lrvg $s3,$tweak+8($sp)
|
||||
lghi %r1,0x87
|
||||
srag %r0,$s3,63 # broadcast upper bit
|
||||
ngr %r1,%r0 # rem
|
||||
srlg %r0,$s1,63 # carry bit from lower half
|
||||
sllg $s1,$s1,1
|
||||
sllg $s3,$s3,1
|
||||
xgr $s1,%r1
|
||||
ogr $s3,%r0
|
||||
lrvgr $s1,$s1 # flip byte order
|
||||
lrvgr $s3,$s3
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
stg $s1,$tweak+0($sp) # save the tweak
|
||||
llgfr $s1,$s1
|
||||
srlg $s2,$s3,32
|
||||
stg $s3,$tweak+8($sp)
|
||||
llgfr $s3,$s3
|
||||
|
||||
x $s0,0($out) # ^=*(inp)|stolen cipher-text
|
||||
x $s1,4($out)
|
||||
x $s2,8($out)
|
||||
x $s3,12($out)
|
||||
st${g} $out,4*$SIZE_T($sp)
|
||||
la $key,0($key1)
|
||||
bras $ra,_s390x_AES_encrypt
|
||||
l${g} $out,4*$SIZE_T($sp)
|
||||
x $s0,`$tweak+0`($sp) # ^=tweak
|
||||
x $s1,`$tweak+4`($sp)
|
||||
x $s2,`$tweak+8`($sp)
|
||||
x $s3,`$tweak+12`($sp)
|
||||
st $s0,0($out)
|
||||
st $s1,4($out)
|
||||
st $s2,8($out)
|
||||
st $s3,12($out)
|
||||
|
||||
.Lxts_enc_done: # exit of the software encrypt path
|
||||
stg $sp,$tweak+0($sp) # wipe tweak, first 8 of its 16 bytes
|
||||
stg $sp,$tweak+8($sp) # ... and the remaining 8 bytes
|
||||
lm${g} %r6,$ra,6*$SIZE_T($sp) # restore registers saved on entry
|
||||
br $ra
|
||||
.size AES_xts_encrypt,.-AES_xts_encrypt
|
||||
___
|
||||
# void AES_xts_decrypt(const char *inp,char *out,size_t len,
|
||||
# const AES_KEY *key1, const AES_KEY *key2,u64 secno);
|
||||
#
|
||||
$code.=<<___;
|
||||
.globl AES_xts_decrypt
|
||||
.type AES_xts_decrypt,\@function
|
||||
.align 16
|
||||
AES_xts_decrypt:
|
||||
xgr %r3,%r4 # flip %r3 and %r4, $out and $len
|
||||
xgr %r4,%r3
|
||||
xgr %r3,%r4
|
||||
___
|
||||
$code.=<<___ if ($SIZE_T==4);
|
||||
llgfr $len,$len
|
||||
___
|
||||
$code.=<<___;
|
||||
st${g} $len,1*$SIZE_T($sp) # save copy of $len
|
||||
aghi $len,-16
|
||||
bcr 4,$ra # abort if less than zero. formally
|
||||
# wrong, because $len is unsigned,
|
||||
# but who can afford asking to
|
||||
# process more than 2^63-1 bytes?
|
||||
tmll $len,0x0f
|
||||
jnz .Lxts_dec_proceed
|
||||
aghi $len,16
|
||||
.Lxts_dec_proceed:
|
||||
___
|
||||
$code.=<<___ if (!$softonly);
|
||||
llgf %r0,240($key2)
|
||||
lhi %r1,16
|
||||
clr %r0,%r1
|
||||
jl .Lxts_dec_software
|
||||
|
||||
stm${g} %r6,$s3,6*$SIZE_T($sp)
|
||||
st${g} $ra,14*$SIZE_T($sp)
|
||||
|
||||
nill $len,0xfff0 # $len&=~15
|
||||
slgr $out,$inp
|
||||
|
||||
# generate the tweak value
|
||||
lrvg $s0,$stdframe($sp) # load secno
|
||||
lghi $s1,0
|
||||
la $s2,$tweak($sp)
|
||||
lghi $s3,16
|
||||
stg $s0,0($s2)
|
||||
stg $s1,8($s2)
|
||||
la %r1,0($key2) # $key2 is not needed past this point
|
||||
.long 0xb92e00aa # km $s2,$s2, generate the tweak
|
||||
brc 1,.-4 # can this happen?
|
||||
|
||||
l %r0,240($key1)
|
||||
la %r1,0($key1) # $key1 is not needed anymore
|
||||
|
||||
ltgr $len,$len
|
||||
jz .Lxts_dec_km_short
|
||||
bras $ra,_s390x_xts_km
|
||||
jz .Lxts_dec_km_done
|
||||
|
||||
lrvgr $s2,$s0 # make copy in reverse byte order
|
||||
lrvgr $s3,$s1
|
||||
j .Lxts_dec_km_2ndtweak
|
||||
|
||||
.Lxts_dec_km_short:
|
||||
llgc $len,`2*$SIZE_T-1`($sp)
|
||||
nill $len,0x0f # $len%=16
|
||||
lrvg $s0,$tweak+0($sp) # load the tweak
|
||||
lrvg $s1,$tweak+8($sp)
|
||||
lrvgr $s2,$s0 # make copy in reverse byte order
|
||||
lrvgr $s3,$s1
|
||||
|
||||
.Lxts_dec_km_2ndtweak:
|
||||
lghi $i1,0x87
|
||||
srag $i2,$s1,63 # broadcast upper bit
|
||||
ngr $i1,$i2 # rem
|
||||
srlg $i2,$s0,63 # carry bit from lower half
|
||||
sllg $s0,$s0,1
|
||||
sllg $s1,$s1,1
|
||||
xgr $s0,$i1
|
||||
ogr $s1,$i2
|
||||
lrvgr $i1,$s0 # flip byte order
|
||||
lrvgr $i2,$s1
|
||||
|
||||
xg $i1,0($inp)
|
||||
xg $i2,8($inp)
|
||||
stg $i1,0($out,$inp)
|
||||
stg $i2,8($out,$inp)
|
||||
la $i2,0($out,$inp)
|
||||
lghi $i3,16
|
||||
.long 0xb92e0066 # km $i2,$i2
|
||||
brc 1,.-4 # can this happen?
|
||||
lrvgr $i1,$s0
|
||||
lrvgr $i2,$s1
|
||||
xg $i1,0($out,$inp)
|
||||
xg $i2,8($out,$inp)
|
||||
stg $i1,0($out,$inp)
|
||||
stg $i2,8($out,$inp)
|
||||
|
||||
la $i3,0($out,$inp) # put aside real $out
|
||||
.Lxts_dec_km_steal:
|
||||
llgc $i1,16($inp)
|
||||
llgc $i2,0($out,$inp)
|
||||
stc $i1,0($out,$inp)
|
||||
stc $i2,16($out,$inp)
|
||||
la $inp,1($inp)
|
||||
brct $len,.Lxts_dec_km_steal
|
||||
|
||||
lgr $s0,$s2
|
||||
lgr $s1,$s3
|
||||
xg $s0,0($i3)
|
||||
xg $s1,8($i3)
|
||||
stg $s0,0($i3)
|
||||
stg $s1,8($i3)
|
||||
la $s0,0($i3)
|
||||
lghi $s1,16
|
||||
.long 0xb92e0088 # km $s0,$s0
|
||||
brc 1,.-4 # can this happen?
|
||||
xg $s2,0($i3)
|
||||
xg $s3,8($i3)
|
||||
stg $s2,0($i3)
|
||||
stg $s3,8($i3)
|
||||
.Lxts_dec_km_done: # exit of the hardware (km) decrypt path
|
||||
l${g} $ra,14*$SIZE_T($sp) # reload return address saved on entry
|
||||
stg $sp,$tweak+0($sp) # wipe tweak, first 8 of its 16 bytes
|
||||
stg $sp,$tweak+8($sp) # ... and the remaining 8 bytes
|
||||
lm${g} %r6,$s3,6*$SIZE_T($sp) # restore callee-saved registers
|
||||
br $ra
|
||||
.align 16
|
||||
.Lxts_dec_software:
|
||||
___
|
||||
$code.=<<___;
|
||||
stm${g} %r6,$ra,6*$SIZE_T($sp)
|
||||
|
||||
srlg $len,$len,4
|
||||
slgr $out,$inp
|
||||
|
||||
xgr $s0,$s0 # clear upper half
|
||||
xgr $s1,$s1
|
||||
lrv $s0,$stdframe+4($sp) # load secno
|
||||
lrv $s1,$stdframe+0($sp)
|
||||
xgr $s2,$s2
|
||||
xgr $s3,$s3
|
||||
stm${g} %r2,%r5,2*$SIZE_T($sp)
|
||||
la $key,0($key2)
|
||||
larl $tbl,AES_Te
|
||||
bras $ra,_s390x_AES_encrypt # generate the tweak
|
||||
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
||||
larl $tbl,AES_Td
|
||||
lt${g}r $len,$len
|
||||
stm $s0,$s3,$tweak($sp) # save the tweak
|
||||
jz .Lxts_dec_short
|
||||
j .Lxts_dec_enter
|
||||
|
||||
.align 16
|
||||
.Lxts_dec_loop:
|
||||
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
||||
lrvg $s3,$tweak+8($sp)
|
||||
lghi %r1,0x87
|
||||
srag %r0,$s3,63 # broadcast upper bit
|
||||
ngr %r1,%r0 # rem
|
||||
srlg %r0,$s1,63 # carry bit from lower half
|
||||
sllg $s1,$s1,1
|
||||
sllg $s3,$s3,1
|
||||
xgr $s1,%r1
|
||||
ogr $s3,%r0
|
||||
lrvgr $s1,$s1 # flip byte order
|
||||
lrvgr $s3,$s3
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
stg $s1,$tweak+0($sp) # save the tweak
|
||||
llgfr $s1,$s1
|
||||
srlg $s2,$s3,32
|
||||
stg $s3,$tweak+8($sp)
|
||||
llgfr $s3,$s3
|
||||
.Lxts_dec_enter:
|
||||
x $s0,0($inp) # tweak^=*(inp)
|
||||
x $s1,4($inp)
|
||||
x $s2,8($inp)
|
||||
x $s3,12($inp)
|
||||
stm${g} %r2,%r3,2*$SIZE_T($sp) # only two registers are changing
|
||||
la $key,0($key1)
|
||||
bras $ra,_s390x_AES_decrypt
|
||||
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
||||
x $s0,$tweak+0($sp) # ^=tweak
|
||||
x $s1,$tweak+4($sp)
|
||||
x $s2,$tweak+8($sp)
|
||||
x $s3,$tweak+12($sp)
|
||||
st $s0,0($out,$inp)
|
||||
st $s1,4($out,$inp)
|
||||
st $s2,8($out,$inp)
|
||||
st $s3,12($out,$inp)
|
||||
la $inp,16($inp)
|
||||
brct${g} $len,.Lxts_dec_loop
|
||||
|
||||
llgc $len,`2*$SIZE_T-1`($sp)
|
||||
nill $len,0x0f # $len%16
|
||||
jz .Lxts_dec_done
|
||||
|
||||
# generate pair of tweaks...
|
||||
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
||||
lrvg $s3,$tweak+8($sp)
|
||||
lghi %r1,0x87
|
||||
srag %r0,$s3,63 # broadcast upper bit
|
||||
ngr %r1,%r0 # rem
|
||||
srlg %r0,$s1,63 # carry bit from lower half
|
||||
sllg $s1,$s1,1
|
||||
sllg $s3,$s3,1
|
||||
xgr $s1,%r1
|
||||
ogr $s3,%r0
|
||||
lrvgr $i2,$s1 # flip byte order
|
||||
lrvgr $i3,$s3
|
||||
stmg $i2,$i3,$tweak($sp) # save the 1st tweak
|
||||
j .Lxts_dec_2ndtweak
|
||||
|
||||
.align 16
|
||||
.Lxts_dec_short:
|
||||
llgc $len,`2*$SIZE_T-1`($sp)
|
||||
nill $len,0x0f # $len%16
|
||||
lrvg $s1,$tweak+0($sp) # load the tweak in little-endian
|
||||
lrvg $s3,$tweak+8($sp)
|
||||
.Lxts_dec_2ndtweak:
|
||||
lghi %r1,0x87
|
||||
srag %r0,$s3,63 # broadcast upper bit
|
||||
ngr %r1,%r0 # rem
|
||||
srlg %r0,$s1,63 # carry bit from lower half
|
||||
sllg $s1,$s1,1
|
||||
sllg $s3,$s3,1
|
||||
xgr $s1,%r1
|
||||
ogr $s3,%r0
|
||||
lrvgr $s1,$s1 # flip byte order
|
||||
lrvgr $s3,$s3
|
||||
srlg $s0,$s1,32 # smash the tweak to 4x32-bits
|
||||
stg $s1,$tweak-16+0($sp) # save the 2nd tweak
|
||||
llgfr $s1,$s1
|
||||
srlg $s2,$s3,32
|
||||
stg $s3,$tweak-16+8($sp)
|
||||
llgfr $s3,$s3
|
||||
|
||||
x $s0,0($inp) # tweak_the_2nd^=*(inp)
|
||||
x $s1,4($inp)
|
||||
x $s2,8($inp)
|
||||
x $s3,12($inp)
|
||||
stm${g} %r2,%r3,2*$SIZE_T($sp)
|
||||
la $key,0($key1)
|
||||
bras $ra,_s390x_AES_decrypt
|
||||
lm${g} %r2,%r5,2*$SIZE_T($sp)
|
||||
x $s0,$tweak-16+0($sp) # ^=tweak_the_2nd
|
||||
x $s1,$tweak-16+4($sp)
|
||||
x $s2,$tweak-16+8($sp)
|
||||
x $s3,$tweak-16+12($sp)
|
||||
st $s0,0($out,$inp)
|
||||
st $s1,4($out,$inp)
|
||||
st $s2,8($out,$inp)
|
||||
st $s3,12($out,$inp)
|
||||
|
||||
la $i3,0($out,$inp) # put aside real $out
|
||||
.Lxts_dec_steal:
|
||||
llgc %r0,16($inp)
|
||||
llgc %r1,0($out,$inp)
|
||||
stc %r0,0($out,$inp)
|
||||
stc %r1,16($out,$inp)
|
||||
la $inp,1($inp)
|
||||
brct $len,.Lxts_dec_steal
|
||||
la $out,0($i3) # restore real $out
|
||||
|
||||
lm $s0,$s3,$tweak($sp) # load the 1st tweak
|
||||
x $s0,0($out) # tweak^=*(inp)|stolen cipher-text
|
||||
x $s1,4($out)
|
||||
x $s2,8($out)
|
||||
x $s3,12($out)
|
||||
st${g} $out,4*$SIZE_T($sp)
|
||||
la $key,0($key1)
|
||||
bras $ra,_s390x_AES_decrypt
|
||||
l${g} $out,4*$SIZE_T($sp)
|
||||
x $s0,$tweak+0($sp) # ^=tweak
|
||||
x $s1,$tweak+4($sp)
|
||||
x $s2,$tweak+8($sp)
|
||||
x $s3,$tweak+12($sp)
|
||||
st $s0,0($out)
|
||||
st $s1,4($out)
|
||||
st $s2,8($out)
|
||||
st $s3,12($out)
|
||||
stg $sp,$tweak-16+0($sp) # wipe 2nd tweak
|
||||
stg $sp,$tweak-16+8($sp)
|
||||
.Lxts_dec_done: # exit of the software decrypt path
|
||||
stg $sp,$tweak+0($sp) # wipe tweak, first 8 of its 16 bytes
|
||||
stg $sp,$tweak+8($sp) # ... and the remaining 8 bytes
|
||||
lm${g} %r6,$ra,6*$SIZE_T($sp) # restore registers saved on entry
|
||||
br $ra
|
||||
.size AES_xts_decrypt,.-AES_xts_decrypt
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
.comm OPENSSL_s390xcap_P,16,8
|
||||
.string "AES for s390x, CRYPTOGAMS by <appro\@openssl.org>"
|
||||
.comm OPENSSL_s390xcap_P,16,8
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
|
@ -41,8 +41,8 @@
|
||||
# processor, as long as it's "z-CPU". Latter implies that the code
|
||||
# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
|
||||
# is achieved by swapping words after 64-bit loads, follow _dswap-s.
|
||||
# On z990 it was measured to perform 2.6-2.2 times better, less for
|
||||
# longer keys...
|
||||
# On z990 it was measured to perform 2.6-2.2 times better than
|
||||
# compiler-generated code, less for longer keys...
|
||||
|
||||
$flavour = shift;
|
||||
|
||||
@ -102,8 +102,8 @@ $code.=<<___ if ($flavour =~ /3[12]/);
|
||||
bnzr %r14 # if ($num&1) return 0;
|
||||
___
|
||||
$code.=<<___ if ($flavour !~ /3[12]/);
|
||||
cghi $num,128 #
|
||||
bhr %r14 # if($num>128) return 0;
|
||||
cghi $num,96 #
|
||||
bhr %r14 # if($num>96) return 0;
|
||||
___
|
||||
$code.=<<___;
|
||||
stm${g} %r3,%r15,3*$SIZE_T($sp)
|
||||
|
@ -28,6 +28,15 @@
|
||||
# remains z/Architecture specific. On z990 it was measured to perform
|
||||
# 2.8x better than 32-bit code generated by gcc 4.3.
|
||||
|
||||
# March 2011.
|
||||
#
|
||||
# Support for hardware KIMD-GHASH is verified to produce correct
|
||||
# result and therefore is engaged. On z196 it was measured to process
|
||||
# 8KB buffer ~7x faster than software implementation. It's not as
|
||||
# impressive for smaller buffer sizes and for smallest 16-bytes buffer
|
||||
# it's actually almost 2 times slower. Which is the reason why
|
||||
# KIMD-GHASH is not used in gcm_gmult_4bit.
|
||||
|
||||
$flavour = shift;
|
||||
|
||||
if ($flavour =~ /3[12]/) {
|
||||
@ -41,7 +50,7 @@ if ($flavour =~ /3[12]/) {
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
$softonly=1; # disable hardware support for now
|
||||
$softonly=0;
|
||||
|
||||
$Zhi="%r0";
|
||||
$Zlo="%r1";
|
||||
@ -70,7 +79,7 @@ $code.=<<___;
|
||||
.align 32
|
||||
gcm_gmult_4bit:
|
||||
___
|
||||
$code.=<<___ if(!$softonly);
|
||||
$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
|
||||
larl %r1,OPENSSL_s390xcap_P
|
||||
lg %r0,0(%r1)
|
||||
tmhl %r0,0x4000 # check for message-security-assist
|
||||
|
Loading…
x
Reference in New Issue
Block a user