aes/asm/aesni-x86[_64].pl: minor Atom-specific performance tweak.
This commit is contained in:
parent
47739161c6
commit
214368ffee
@ -207,12 +207,45 @@ sub aesni_generate1 # fully unrolled loop
|
|||||||
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
|
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
|
||||||
# utilization, i.e. when subroutine's throughput is virtually same as
|
# utilization, i.e. when subroutine's throughput is virtually same as
|
||||||
# of non-interleaved subroutine [for number of input blocks up to 3].
|
# of non-interleaved subroutine [for number of input blocks up to 3].
|
||||||
# This is why it makes no sense to implement 2x subroutine.
|
# This is why it originally made no sense to implement 2x subroutine.
|
||||||
# aes[enc|dec] latency in next processor generation is 8, but the
|
# But times change and it became appropriate to spend extra 192 bytes
|
||||||
# instructions can be scheduled every cycle. Optimal interleave for
|
# on a 2x subroutine for Atom Silvermont's sake. For processors that
|
||||||
# new processor is therefore 8x, but it's unfeasible to accommodate it
|
# can schedule aes[enc|dec] every cycle, the optimal interleave factor
|
||||||
# in XMM registers addressable in 32-bit mode and therefore 6x is
|
# equals the corresponding instruction latency. 8x is optimal for
|
||||||
# used instead...
|
# * Bridge, but it's infeasible to accommodate such an implementation
|
||||||
|
# in XMM registers addressable in 32-bit mode and therefore maximum
|
||||||
|
# of 6x is used instead...
|
||||||
|
|
||||||
|
sub aesni_generate2
|
||||||
|
{ my $p=shift;
|
||||||
|
|
||||||
|
&function_begin_B("_aesni_${p}rypt2");
|
||||||
|
&$movekey ($rndkey0,&QWP(0,$key));
|
||||||
|
&shl ($rounds,4);
|
||||||
|
&$movekey ($rndkey1,&QWP(16,$key));
|
||||||
|
&xorps ($inout0,$rndkey0);
|
||||||
|
&pxor ($inout1,$rndkey0);
|
||||||
|
&$movekey ($rndkey0,&QWP(32,$key));
|
||||||
|
&lea ($key,&DWP(32,$key,$rounds));
|
||||||
|
&neg ($rounds);
|
||||||
|
&add ($rounds,16);
|
||||||
|
|
||||||
|
&set_label("${p}2_loop");
|
||||||
|
eval"&aes${p} ($inout0,$rndkey1)";
|
||||||
|
eval"&aes${p} ($inout1,$rndkey1)";
|
||||||
|
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
|
||||||
|
&add ($rounds,32);
|
||||||
|
eval"&aes${p} ($inout0,$rndkey0)";
|
||||||
|
eval"&aes${p} ($inout1,$rndkey0)";
|
||||||
|
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
|
||||||
|
&jnz (&label("${p}2_loop"));
|
||||||
|
eval"&aes${p} ($inout0,$rndkey1)";
|
||||||
|
eval"&aes${p} ($inout1,$rndkey1)";
|
||||||
|
eval"&aes${p}last ($inout0,$rndkey0)";
|
||||||
|
eval"&aes${p}last ($inout1,$rndkey0)";
|
||||||
|
&ret();
|
||||||
|
&function_end_B("_aesni_${p}rypt2");
|
||||||
|
}
|
||||||
|
|
||||||
sub aesni_generate3
|
sub aesni_generate3
|
||||||
{ my $p=shift;
|
{ my $p=shift;
|
||||||
@ -357,6 +390,8 @@ sub aesni_generate6
|
|||||||
&ret();
|
&ret();
|
||||||
&function_end_B("_aesni_${p}rypt6");
|
&function_end_B("_aesni_${p}rypt6");
|
||||||
}
|
}
|
||||||
|
&aesni_generate2("enc") if ($PREFIX eq "aesni");
|
||||||
|
&aesni_generate2("dec");
|
||||||
&aesni_generate3("enc") if ($PREFIX eq "aesni");
|
&aesni_generate3("enc") if ($PREFIX eq "aesni");
|
||||||
&aesni_generate3("dec");
|
&aesni_generate3("dec");
|
||||||
&aesni_generate4("enc") if ($PREFIX eq "aesni");
|
&aesni_generate4("enc") if ($PREFIX eq "aesni");
|
||||||
@ -460,8 +495,7 @@ if ($PREFIX eq "aesni") {
|
|||||||
&jmp (&label("ecb_ret"));
|
&jmp (&label("ecb_ret"));
|
||||||
|
|
||||||
&set_label("ecb_enc_two",16);
|
&set_label("ecb_enc_two",16);
|
||||||
&xorps ($inout2,$inout2);
|
&call ("_aesni_encrypt2");
|
||||||
&call ("_aesni_encrypt3");
|
|
||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
&movups (&QWP(0x10,$out),$inout1);
|
&movups (&QWP(0x10,$out),$inout1);
|
||||||
&jmp (&label("ecb_ret"));
|
&jmp (&label("ecb_ret"));
|
||||||
@ -561,8 +595,7 @@ if ($PREFIX eq "aesni") {
|
|||||||
&jmp (&label("ecb_ret"));
|
&jmp (&label("ecb_ret"));
|
||||||
|
|
||||||
&set_label("ecb_dec_two",16);
|
&set_label("ecb_dec_two",16);
|
||||||
&xorps ($inout2,$inout2);
|
&call ("_aesni_decrypt2");
|
||||||
&call ("_aesni_decrypt3");
|
|
||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
&movups (&QWP(0x10,$out),$inout1);
|
&movups (&QWP(0x10,$out),$inout1);
|
||||||
&jmp (&label("ecb_ret"));
|
&jmp (&label("ecb_ret"));
|
||||||
@ -982,7 +1015,7 @@ if ($PREFIX eq "aesni") {
|
|||||||
&jmp (&label("ctr32_ret"));
|
&jmp (&label("ctr32_ret"));
|
||||||
|
|
||||||
&set_label("ctr32_two",16);
|
&set_label("ctr32_two",16);
|
||||||
&call ("_aesni_encrypt3");
|
&call ("_aesni_encrypt2");
|
||||||
&movups ($inout3,&QWP(0,$inp));
|
&movups ($inout3,&QWP(0,$inp));
|
||||||
&movups ($inout4,&QWP(0x10,$inp));
|
&movups ($inout4,&QWP(0x10,$inp));
|
||||||
&xorps ($inout0,$inout3);
|
&xorps ($inout0,$inout3);
|
||||||
@ -1253,9 +1286,8 @@ if ($PREFIX eq "aesni") {
|
|||||||
&lea ($inp,&DWP(16*2,$inp));
|
&lea ($inp,&DWP(16*2,$inp));
|
||||||
&xorps ($inout0,$inout3); # input^=tweak
|
&xorps ($inout0,$inout3); # input^=tweak
|
||||||
&xorps ($inout1,$inout4);
|
&xorps ($inout1,$inout4);
|
||||||
&xorps ($inout2,$inout2);
|
|
||||||
|
|
||||||
&call ("_aesni_encrypt3");
|
&call ("_aesni_encrypt2");
|
||||||
|
|
||||||
&xorps ($inout0,$inout3); # output^=tweak
|
&xorps ($inout0,$inout3); # output^=tweak
|
||||||
&xorps ($inout1,$inout4);
|
&xorps ($inout1,$inout4);
|
||||||
@ -1596,7 +1628,7 @@ if ($PREFIX eq "aesni") {
|
|||||||
&xorps ($inout0,$inout3); # input^=tweak
|
&xorps ($inout0,$inout3); # input^=tweak
|
||||||
&xorps ($inout1,$inout4);
|
&xorps ($inout1,$inout4);
|
||||||
|
|
||||||
&call ("_aesni_decrypt3");
|
&call ("_aesni_decrypt2");
|
||||||
|
|
||||||
&xorps ($inout0,$inout3); # output^=tweak
|
&xorps ($inout0,$inout3); # output^=tweak
|
||||||
&xorps ($inout1,$inout4);
|
&xorps ($inout1,$inout4);
|
||||||
@ -1896,8 +1928,7 @@ if ($PREFIX eq "aesni") {
|
|||||||
&jmp (&label("cbc_dec_tail_collected"));
|
&jmp (&label("cbc_dec_tail_collected"));
|
||||||
|
|
||||||
&set_label("cbc_dec_two",16);
|
&set_label("cbc_dec_two",16);
|
||||||
&xorps ($inout2,$inout2);
|
&call ("_aesni_decrypt2");
|
||||||
&call ("_aesni_decrypt3");
|
|
||||||
&xorps ($inout0,$ivec);
|
&xorps ($inout0,$ivec);
|
||||||
&xorps ($inout1,$in0);
|
&xorps ($inout1,$in0);
|
||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
|
@ -288,10 +288,49 @@ ___
|
|||||||
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
|
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
|
||||||
# utilization, i.e. when subroutine's throughput is virtually same as
|
# utilization, i.e. when subroutine's throughput is virtually same as
|
||||||
# of non-interleaved subroutine [for number of input blocks up to 3].
|
# of non-interleaved subroutine [for number of input blocks up to 3].
|
||||||
# This is why it makes no sense to implement 2x subroutine.
|
# This is why it originally made no sense to implement 2x subroutine.
|
||||||
# aes[enc|dec] latency in next processor generation is 8, but the
|
# But times change and it became appropriate to spend extra 192 bytes
|
||||||
# instructions can be scheduled every cycle. Optimal interleave for
|
# on a 2x subroutine for Atom Silvermont's sake. For processors that
|
||||||
# new processor is therefore 8x...
|
# can schedule aes[enc|dec] every cycle, the optimal interleave factor
|
||||||
|
# equals the corresponding instruction latency. 8x is optimal for
|
||||||
|
# * Bridge and "super-optimal" for other Intel CPUs...
|
||||||
|
|
||||||
|
sub aesni_generate2 {
|
||||||
|
my $dir=shift;
|
||||||
|
# As already mentioned it takes in $key and $rounds, which are *not*
|
||||||
|
# preserved. $inout[0-1] is cipher/clear text...
|
||||||
|
$code.=<<___;
|
||||||
|
.type _aesni_${dir}rypt2,\@abi-omnipotent
|
||||||
|
.align 16
|
||||||
|
_aesni_${dir}rypt2:
|
||||||
|
$movkey ($key),$rndkey0
|
||||||
|
shl \$4,$rounds
|
||||||
|
$movkey 16($key),$rndkey1
|
||||||
|
xorps $rndkey0,$inout0
|
||||||
|
xorps $rndkey0,$inout1
|
||||||
|
$movkey 32($key),$rndkey0
|
||||||
|
lea 32($key,$rounds),$key
|
||||||
|
neg %rax # $rounds
|
||||||
|
add \$16,%rax
|
||||||
|
|
||||||
|
.L${dir}_loop2:
|
||||||
|
aes${dir} $rndkey1,$inout0
|
||||||
|
aes${dir} $rndkey1,$inout1
|
||||||
|
$movkey ($key,%rax),$rndkey1
|
||||||
|
add \$32,%rax
|
||||||
|
aes${dir} $rndkey0,$inout0
|
||||||
|
aes${dir} $rndkey0,$inout1
|
||||||
|
$movkey -16($key,%rax),$rndkey0
|
||||||
|
jnz .L${dir}_loop2
|
||||||
|
|
||||||
|
aes${dir} $rndkey1,$inout0
|
||||||
|
aes${dir} $rndkey1,$inout1
|
||||||
|
aes${dir}last $rndkey0,$inout0
|
||||||
|
aes${dir}last $rndkey0,$inout1
|
||||||
|
ret
|
||||||
|
.size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
|
||||||
|
___
|
||||||
|
}
|
||||||
sub aesni_generate3 {
|
sub aesni_generate3 {
|
||||||
my $dir=shift;
|
my $dir=shift;
|
||||||
# As already mentioned it takes in $key and $rounds, which are *not*
|
# As already mentioned it takes in $key and $rounds, which are *not*
|
||||||
@ -524,6 +563,8 @@ _aesni_${dir}rypt8:
|
|||||||
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
|
.size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
|
||||||
___
|
___
|
||||||
}
|
}
|
||||||
|
&aesni_generate2("enc") if ($PREFIX eq "aesni");
|
||||||
|
&aesni_generate2("dec");
|
||||||
&aesni_generate3("enc") if ($PREFIX eq "aesni");
|
&aesni_generate3("enc") if ($PREFIX eq "aesni");
|
||||||
&aesni_generate3("dec");
|
&aesni_generate3("dec");
|
||||||
&aesni_generate4("enc") if ($PREFIX eq "aesni");
|
&aesni_generate4("enc") if ($PREFIX eq "aesni");
|
||||||
@ -645,8 +686,7 @@ $code.=<<___;
|
|||||||
jmp .Lecb_ret
|
jmp .Lecb_ret
|
||||||
.align 16
|
.align 16
|
||||||
.Lecb_enc_two:
|
.Lecb_enc_two:
|
||||||
xorps $inout2,$inout2
|
call _aesni_encrypt2
|
||||||
call _aesni_encrypt3
|
|
||||||
movups $inout0,($out)
|
movups $inout0,($out)
|
||||||
movups $inout1,0x10($out)
|
movups $inout1,0x10($out)
|
||||||
jmp .Lecb_ret
|
jmp .Lecb_ret
|
||||||
@ -782,8 +822,7 @@ $code.=<<___;
|
|||||||
jmp .Lecb_ret
|
jmp .Lecb_ret
|
||||||
.align 16
|
.align 16
|
||||||
.Lecb_dec_two:
|
.Lecb_dec_two:
|
||||||
xorps $inout2,$inout2
|
call _aesni_decrypt2
|
||||||
call _aesni_decrypt3
|
|
||||||
movups $inout0,($out)
|
movups $inout0,($out)
|
||||||
movups $inout1,0x10($out)
|
movups $inout1,0x10($out)
|
||||||
jmp .Lecb_ret
|
jmp .Lecb_ret
|
||||||
@ -1875,7 +1914,7 @@ $code.=<<___;
|
|||||||
xorps @tweak[0],$inout0
|
xorps @tweak[0],$inout0
|
||||||
xorps @tweak[1],$inout1
|
xorps @tweak[1],$inout1
|
||||||
|
|
||||||
call _aesni_encrypt3
|
call _aesni_encrypt2
|
||||||
|
|
||||||
xorps @tweak[0],$inout0
|
xorps @tweak[0],$inout0
|
||||||
movdqa @tweak[2],@tweak[0]
|
movdqa @tweak[2],@tweak[0]
|
||||||
@ -2322,7 +2361,7 @@ $code.=<<___;
|
|||||||
xorps @tweak[0],$inout0
|
xorps @tweak[0],$inout0
|
||||||
xorps @tweak[1],$inout1
|
xorps @tweak[1],$inout1
|
||||||
|
|
||||||
call _aesni_decrypt3
|
call _aesni_decrypt2
|
||||||
|
|
||||||
xorps @tweak[0],$inout0
|
xorps @tweak[0],$inout0
|
||||||
movdqa @tweak[2],@tweak[0]
|
movdqa @tweak[2],@tweak[0]
|
||||||
@ -2831,8 +2870,7 @@ $code.=<<___;
|
|||||||
.align 16
|
.align 16
|
||||||
.Lcbc_dec_two:
|
.Lcbc_dec_two:
|
||||||
movaps $inout1,$in1
|
movaps $inout1,$in1
|
||||||
xorps $inout2,$inout2
|
call _aesni_decrypt2
|
||||||
call _aesni_decrypt3
|
|
||||||
pxor $iv,$inout0
|
pxor $iv,$inout0
|
||||||
movaps $in1,$iv
|
movaps $in1,$iv
|
||||||
pxor $in0,$inout1
|
pxor $in0,$inout1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user