aes/asm/aesni-x86_64.pl: further optimization for Atom Silvermont.
Improve CBC decrypt and CTR by ~13% and ~16% respectively, which adds up to ~25% and ~33% improvement over the "pre-Silvermont" version. [Also add a performance table to aesni-x86.pl.]
This commit is contained in:
parent
385b348666
commit
5599c7331b
@ -43,6 +43,17 @@
|
|||||||
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
|
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
|
||||||
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
|
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
# Current large-block performance in cycles per byte processed with
|
||||||
|
# 128-bit key (less is better).
|
||||||
|
#
|
||||||
|
# CBC en-/decrypt CTR XTS ECB
|
||||||
|
# Westmere 3.77/1.37 1.37 1.52 1.27
|
||||||
|
# * Bridge 5.07/0.98 0.99 1.09 0.91
|
||||||
|
# Haswell 4.44/0.80 0.97 1.03 0.72
|
||||||
|
# Atom 5.77/3.56 3.67 4.03 3.46
|
||||||
|
# Bulldozer 5.80/0.98 1.05 1.24 0.93
|
||||||
|
|
||||||
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
||||||
# generates drop-in replacement for
|
# generates drop-in replacement for
|
||||||
# crypto/aes/asm/aes-586.pl:-)
|
# crypto/aes/asm/aes-586.pl:-)
|
||||||
|
@ -158,25 +158,19 @@
|
|||||||
# in CTR mode AES instruction interleave factor was chosen to be 6x.
|
# in CTR mode AES instruction interleave factor was chosen to be 6x.
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
|
# Current large-block performance in cycles per byte processed with
|
||||||
# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
|
# 128-bit key (less is better).
|
||||||
# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
|
#
|
||||||
# instruction latency is 9 cycles and that they can be issued every
|
# CBC en-/decrypt CTR XTS ECB
|
||||||
# cycle.
|
# Westmere 3.77/1.25 1.25 1.25 1.26
|
||||||
|
# * Bridge 5.07/0.74 0.75 0.90 0.85
|
||||||
######################################################################
|
# Haswell 4.44/0.63 0.63 0.73 0.63
|
||||||
# Haswell spends 4.44 cycles per byte in CBC encrypt, 0.63 in CBC
|
# Atom 5.75/3.54 3.56 4.12 3.87(*)
|
||||||
# decrypt, CTR and ECB, 0.73 in XTS.
|
# Bulldozer 5.77/0.70 0.72 0.90 0.70
|
||||||
|
#
|
||||||
######################################################################
|
# (*) Atom ECB result is suboptimal because of penalties incurred
|
||||||
# Atom Silvermont spends 5.77/4.0 cycles per byte in CBC en-/decrypt,
|
# by operations on %xmm8-15. As ECB is not considered
|
||||||
# 3.87 in ECB, 4.15 in CTR, 4.12 in XTS. Results for parallelizable
|
# critical, nothing was done to mitigate the problem.
|
||||||
# modes [other than XTS] are actually suboptimal, because of penalties
|
|
||||||
# incurred by operations on %xmm8-15, which are inevitable with such
|
|
||||||
# high instruction interleave factors. This means that performance can
|
|
||||||
# be improved by decreasing the interleave factor, but then it would
|
|
||||||
# negatively affect other platforms in relatively larger degree.
|
|
||||||
# Run-time detection would solve the dilemma...
|
|
||||||
|
|
||||||
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
||||||
# generates drop-in replacement for
|
# generates drop-in replacement for
|
||||||
@ -201,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
|
|||||||
("%rdi","%rsi","%rdx","%rcx"); # Unix order
|
("%rdi","%rsi","%rdx","%rcx"); # Unix order
|
||||||
|
|
||||||
$code=".text\n";
|
$code=".text\n";
|
||||||
|
$code.=".extern OPENSSL_ia32cap_P\n";
|
||||||
|
|
||||||
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
|
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
|
||||||
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
|
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
|
||||||
@ -1119,7 +1114,9 @@ $code.=<<___;
|
|||||||
lea 7($ctr),%r9
|
lea 7($ctr),%r9
|
||||||
mov %r10d,0x60+12(%rsp)
|
mov %r10d,0x60+12(%rsp)
|
||||||
bswap %r9d
|
bswap %r9d
|
||||||
|
mov OPENSSL_ia32cap_P+4(%rip),%r10d
|
||||||
xor $key0,%r9d
|
xor $key0,%r9d
|
||||||
|
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
|
||||||
mov %r9d,0x70+12(%rsp)
|
mov %r9d,0x70+12(%rsp)
|
||||||
|
|
||||||
$movkey 0x10($key),$rndkey1
|
$movkey 0x10($key),$rndkey1
|
||||||
@ -1130,10 +1127,104 @@ $code.=<<___;
|
|||||||
cmp \$8,$len
|
cmp \$8,$len
|
||||||
jb .Lctr32_tail
|
jb .Lctr32_tail
|
||||||
|
|
||||||
|
sub \$6,$len
|
||||||
|
cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
|
||||||
|
je .Lctr32_6x
|
||||||
|
|
||||||
lea 0x80($key),$key # size optimization
|
lea 0x80($key),$key # size optimization
|
||||||
sub \$8,$len
|
sub \$2,$len
|
||||||
jmp .Lctr32_loop8
|
jmp .Lctr32_loop8
|
||||||
|
|
||||||
|
.align 16
|
||||||
|
.Lctr32_6x:
|
||||||
|
shl \$4,$rounds
|
||||||
|
mov \$48,$rnds_
|
||||||
|
bswap $key0
|
||||||
|
lea 32($key,$rounds),$key # end of key schedule
|
||||||
|
sub %rax,%r10 # twisted $rounds
|
||||||
|
jmp .Lctr32_loop6
|
||||||
|
|
||||||
|
.align 16
|
||||||
|
.Lctr32_loop6:
|
||||||
|
add \$6,$ctr
|
||||||
|
$movkey -48($key,$rnds_),$rndkey0
|
||||||
|
aesenc $rndkey1,$inout0
|
||||||
|
mov $ctr,%eax
|
||||||
|
xor $key0,%eax
|
||||||
|
aesenc $rndkey1,$inout1
|
||||||
|
movbe %eax,`0x00+12`(%rsp)
|
||||||
|
lea 1($ctr),%eax
|
||||||
|
aesenc $rndkey1,$inout2
|
||||||
|
xor $key0,%eax
|
||||||
|
movbe %eax,`0x10+12`(%rsp)
|
||||||
|
aesenc $rndkey1,$inout3
|
||||||
|
lea 2($ctr),%eax
|
||||||
|
xor $key0,%eax
|
||||||
|
aesenc $rndkey1,$inout4
|
||||||
|
movbe %eax,`0x20+12`(%rsp)
|
||||||
|
lea 3($ctr),%eax
|
||||||
|
aesenc $rndkey1,$inout5
|
||||||
|
$movkey -32($key,$rnds_),$rndkey1
|
||||||
|
xor $key0,%eax
|
||||||
|
|
||||||
|
aesenc $rndkey0,$inout0
|
||||||
|
movbe %eax,`0x30+12`(%rsp)
|
||||||
|
lea 4($ctr),%eax
|
||||||
|
aesenc $rndkey0,$inout1
|
||||||
|
xor $key0,%eax
|
||||||
|
movbe %eax,`0x40+12`(%rsp)
|
||||||
|
aesenc $rndkey0,$inout2
|
||||||
|
lea 5($ctr),%eax
|
||||||
|
xor $key0,%eax
|
||||||
|
aesenc $rndkey0,$inout3
|
||||||
|
movbe %eax,`0x50+12`(%rsp)
|
||||||
|
mov %r10,%rax # mov $rnds_,$rounds
|
||||||
|
aesenc $rndkey0,$inout4
|
||||||
|
aesenc $rndkey0,$inout5
|
||||||
|
$movkey -16($key,$rnds_),$rndkey0
|
||||||
|
|
||||||
|
call .Lenc_loop6
|
||||||
|
|
||||||
|
movdqu ($inp),$inout6
|
||||||
|
movdqu 0x10($inp),$inout7
|
||||||
|
movdqu 0x20($inp),$in0
|
||||||
|
movdqu 0x30($inp),$in1
|
||||||
|
movdqu 0x40($inp),$in2
|
||||||
|
movdqu 0x50($inp),$in3
|
||||||
|
lea 0x60($inp),$inp
|
||||||
|
$movkey -64($key,$rnds_),$rndkey1
|
||||||
|
pxor $inout0,$inout6
|
||||||
|
movaps 0x00(%rsp),$inout0
|
||||||
|
pxor $inout1,$inout7
|
||||||
|
movaps 0x10(%rsp),$inout1
|
||||||
|
pxor $inout2,$in0
|
||||||
|
movaps 0x20(%rsp),$inout2
|
||||||
|
pxor $inout3,$in1
|
||||||
|
movaps 0x30(%rsp),$inout3
|
||||||
|
pxor $inout4,$in2
|
||||||
|
movaps 0x40(%rsp),$inout4
|
||||||
|
pxor $inout5,$in3
|
||||||
|
movaps 0x50(%rsp),$inout5
|
||||||
|
movdqu $inout6,($out)
|
||||||
|
movdqu $inout7,0x10($out)
|
||||||
|
movdqu $in0,0x20($out)
|
||||||
|
movdqu $in1,0x30($out)
|
||||||
|
movdqu $in2,0x40($out)
|
||||||
|
movdqu $in3,0x50($out)
|
||||||
|
lea 0x60($out),$out
|
||||||
|
|
||||||
|
sub \$6,$len
|
||||||
|
jnc .Lctr32_loop6
|
||||||
|
|
||||||
|
add \$6,$len
|
||||||
|
jz .Lctr32_done
|
||||||
|
|
||||||
|
lea -48($rnds_),$rounds
|
||||||
|
lea -80($key,$rnds_),$key # restore $key
|
||||||
|
neg $rounds
|
||||||
|
shr \$4,$rounds # restore $rounds
|
||||||
|
jmp .Lctr32_tail
|
||||||
|
|
||||||
.align 32
|
.align 32
|
||||||
.Lctr32_loop8:
|
.Lctr32_loop8:
|
||||||
add \$8,$ctr
|
add \$8,$ctr
|
||||||
@ -2455,10 +2546,15 @@ $code.=<<___;
|
|||||||
movdqa $inout3,$in3
|
movdqa $inout3,$in3
|
||||||
movdqu 0x50($inp),$inout5
|
movdqu 0x50($inp),$inout5
|
||||||
movdqa $inout4,$in4
|
movdqa $inout4,$in4
|
||||||
|
mov OPENSSL_ia32cap_P+4(%rip),%r9d
|
||||||
cmp \$0x70,$len
|
cmp \$0x70,$len
|
||||||
jbe .Lcbc_dec_six_or_seven
|
jbe .Lcbc_dec_six_or_seven
|
||||||
|
|
||||||
sub \$0x70,$len
|
and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
|
||||||
|
sub \$0x50,$len
|
||||||
|
cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
|
||||||
|
je .Lcbc_dec_loop6_enter
|
||||||
|
sub \$0x20,$len
|
||||||
lea 0x70($key),$key # size optimization
|
lea 0x70($key),$key # size optimization
|
||||||
jmp .Lcbc_dec_loop8_enter
|
jmp .Lcbc_dec_loop8_enter
|
||||||
.align 16
|
.align 16
|
||||||
@ -2638,6 +2734,51 @@ $code.=<<___;
|
|||||||
movdqa $inout6,$inout0
|
movdqa $inout6,$inout0
|
||||||
jmp .Lcbc_dec_tail_collected
|
jmp .Lcbc_dec_tail_collected
|
||||||
|
|
||||||
|
.align 16
|
||||||
|
.Lcbc_dec_loop6:
|
||||||
|
movups $inout5,($out)
|
||||||
|
lea 0x10($out),$out
|
||||||
|
movdqu 0x00($inp),$inout0 # load input
|
||||||
|
movdqu 0x10($inp),$inout1
|
||||||
|
movdqa $inout0,$in0
|
||||||
|
movdqu 0x20($inp),$inout2
|
||||||
|
movdqa $inout1,$in1
|
||||||
|
movdqu 0x30($inp),$inout3
|
||||||
|
movdqa $inout2,$in2
|
||||||
|
movdqu 0x40($inp),$inout4
|
||||||
|
movdqa $inout3,$in3
|
||||||
|
movdqu 0x50($inp),$inout5
|
||||||
|
movdqa $inout4,$in4
|
||||||
|
.Lcbc_dec_loop6_enter:
|
||||||
|
lea 0x60($inp),$inp
|
||||||
|
movdqa $inout5,$inout6
|
||||||
|
|
||||||
|
call _aesni_decrypt6
|
||||||
|
|
||||||
|
pxor $iv,$inout0 # ^= IV
|
||||||
|
movdqa $inout6,$iv
|
||||||
|
pxor $in0,$inout1
|
||||||
|
movdqu $inout0,($out)
|
||||||
|
pxor $in1,$inout2
|
||||||
|
movdqu $inout1,0x10($out)
|
||||||
|
pxor $in2,$inout3
|
||||||
|
movdqu $inout2,0x20($out)
|
||||||
|
pxor $in3,$inout4
|
||||||
|
mov $key_,$key
|
||||||
|
movdqu $inout3,0x30($out)
|
||||||
|
pxor $in4,$inout5
|
||||||
|
mov $rnds_,$rounds
|
||||||
|
movdqu $inout4,0x40($out)
|
||||||
|
lea 0x50($out),$out
|
||||||
|
sub \$0x60,$len
|
||||||
|
ja .Lcbc_dec_loop6
|
||||||
|
|
||||||
|
movdqa $inout5,$inout0
|
||||||
|
add \$0x50,$len
|
||||||
|
jle .Lcbc_dec_tail_collected
|
||||||
|
movups $inout5,($out)
|
||||||
|
lea 0x10($out),$out
|
||||||
|
|
||||||
.Lcbc_dec_tail:
|
.Lcbc_dec_tail:
|
||||||
movups ($inp),$inout0
|
movups ($inp),$inout0
|
||||||
sub \$0x10,$len
|
sub \$0x10,$len
|
||||||
@ -3360,8 +3501,14 @@ sub aesni {
|
|||||||
return $line;
|
return $line;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Return a `.byte` directive that hand-encodes `movbe %eax,disp8(%rsp)`.
# 0x0f,0x38,0xf1 is the MOVBE register-to-memory opcode; ModR/M 0x44 selects
# %eax as the source with a SIB byte plus 8-bit displacement; SIB 0x24 selects
# %rsp as the base with no index.  The single argument is appended as that
# displacement byte, so it must fit in disp8 (the visible call sites evaluate
# to 12..92).  Presumably this exists so assemblers that do not know the
# MOVBE mnemonic can still build the output -- confirm against the toolchain
# requirements.
sub movbe {
|
||||||
|
".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
|
||||||
|
}
|
||||||
|
|
||||||
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||||
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
|
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
|
||||||
|
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
|
||||||
|
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
|
||||||
|
|
||||||
print $code;
|
print $code;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user