aes/asm/aesni-x86[_64].pl update.
This addresses:
- request for improvement for faster key setup in RT#3576;
- clearing registers and stack in RT#3554 (this is more of a gesture to see if there will be some traction from compiler side);
- more commentary around input parameters handling and stack layout (desired when RT#3553 was reviewed);
- minor size and single block performance optimization (was lying around).

Reviewed-by: Matt Caswell <matt@openssl.org>
This commit is contained in:
parent
313e6ec11f
commit
23f6eec71d
@ -51,7 +51,7 @@
|
|||||||
# Westmere 3.77/1.37 1.37 1.52 1.27
|
# Westmere 3.77/1.37 1.37 1.52 1.27
|
||||||
# * Bridge 5.07/0.98 0.99 1.09 0.91
|
# * Bridge 5.07/0.98 0.99 1.09 0.91
|
||||||
# Haswell 4.44/0.80 0.97 1.03 0.72
|
# Haswell 4.44/0.80 0.97 1.03 0.72
|
||||||
# Atom 5.77/3.56 3.67 4.03 3.46
|
# Silvermont 5.77/3.56 3.67 4.03 3.46
|
||||||
# Bulldozer 5.80/0.98 1.05 1.24 0.93
|
# Bulldozer 5.80/0.98 1.05 1.24 0.93
|
||||||
|
|
||||||
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
||||||
@ -65,6 +65,9 @@ require "x86asm.pl";
|
|||||||
|
|
||||||
&asm_init($ARGV[0],$0);
|
&asm_init($ARGV[0],$0);
|
||||||
|
|
||||||
|
&external_label("OPENSSL_ia32cap_P");
|
||||||
|
&static_label("key_const");
|
||||||
|
|
||||||
if ($PREFIX eq "aesni") { $movekey=\&movups; }
|
if ($PREFIX eq "aesni") { $movekey=\&movups; }
|
||||||
else { $movekey=\&movups; }
|
else { $movekey=\&movups; }
|
||||||
|
|
||||||
@ -181,7 +184,10 @@ sub aesni_generate1 # fully unrolled loop
|
|||||||
{ &aesni_inline_generate1("enc"); }
|
{ &aesni_inline_generate1("enc"); }
|
||||||
else
|
else
|
||||||
{ &call ("_aesni_encrypt1"); }
|
{ &call ("_aesni_encrypt1"); }
|
||||||
|
&pxor ($rndkey0,$rndkey0); # clear register bank
|
||||||
|
&pxor ($rndkey1,$rndkey1);
|
||||||
&movups (&QWP(0,"eax"),$inout0);
|
&movups (&QWP(0,"eax"),$inout0);
|
||||||
|
&pxor ($inout0,$inout0);
|
||||||
&ret ();
|
&ret ();
|
||||||
&function_end_B("${PREFIX}_encrypt");
|
&function_end_B("${PREFIX}_encrypt");
|
||||||
|
|
||||||
@ -197,7 +203,10 @@ sub aesni_generate1 # fully unrolled loop
|
|||||||
{ &aesni_inline_generate1("dec"); }
|
{ &aesni_inline_generate1("dec"); }
|
||||||
else
|
else
|
||||||
{ &call ("_aesni_decrypt1"); }
|
{ &call ("_aesni_decrypt1"); }
|
||||||
|
&pxor ($rndkey0,$rndkey0); # clear register bank
|
||||||
|
&pxor ($rndkey1,$rndkey1);
|
||||||
&movups (&QWP(0,"eax"),$inout0);
|
&movups (&QWP(0,"eax"),$inout0);
|
||||||
|
&pxor ($inout0,$inout0);
|
||||||
&ret ();
|
&ret ();
|
||||||
&function_end_B("${PREFIX}_decrypt");
|
&function_end_B("${PREFIX}_decrypt");
|
||||||
|
|
||||||
@ -349,17 +358,15 @@ sub aesni_generate6
|
|||||||
&neg ($rounds);
|
&neg ($rounds);
|
||||||
eval"&aes${p} ($inout2,$rndkey1)";
|
eval"&aes${p} ($inout2,$rndkey1)";
|
||||||
&pxor ($inout5,$rndkey0);
|
&pxor ($inout5,$rndkey0);
|
||||||
|
&$movekey ($rndkey0,&QWP(0,$key,$rounds));
|
||||||
&add ($rounds,16);
|
&add ($rounds,16);
|
||||||
eval"&aes${p} ($inout3,$rndkey1)";
|
&jmp (&label("_aesni_${p}rypt6_inner"));
|
||||||
eval"&aes${p} ($inout4,$rndkey1)";
|
|
||||||
eval"&aes${p} ($inout5,$rndkey1)";
|
|
||||||
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
|
|
||||||
&jmp (&label("_aesni_${p}rypt6_enter"));
|
|
||||||
|
|
||||||
&set_label("${p}6_loop",16);
|
&set_label("${p}6_loop",16);
|
||||||
eval"&aes${p} ($inout0,$rndkey1)";
|
eval"&aes${p} ($inout0,$rndkey1)";
|
||||||
eval"&aes${p} ($inout1,$rndkey1)";
|
eval"&aes${p} ($inout1,$rndkey1)";
|
||||||
eval"&aes${p} ($inout2,$rndkey1)";
|
eval"&aes${p} ($inout2,$rndkey1)";
|
||||||
|
&set_label("_aesni_${p}rypt6_inner");
|
||||||
eval"&aes${p} ($inout3,$rndkey1)";
|
eval"&aes${p} ($inout3,$rndkey1)";
|
||||||
eval"&aes${p} ($inout4,$rndkey1)";
|
eval"&aes${p} ($inout4,$rndkey1)";
|
||||||
eval"&aes${p} ($inout5,$rndkey1)";
|
eval"&aes${p} ($inout5,$rndkey1)";
|
||||||
@ -615,6 +622,14 @@ if ($PREFIX eq "aesni") {
|
|||||||
&movups (&QWP(0x30,$out),$inout3);
|
&movups (&QWP(0x30,$out),$inout3);
|
||||||
|
|
||||||
&set_label("ecb_ret");
|
&set_label("ecb_ret");
|
||||||
|
&pxor ("xmm0","xmm0"); # clear register bank
|
||||||
|
&pxor ("xmm1","xmm1");
|
||||||
|
&pxor ("xmm2","xmm2");
|
||||||
|
&pxor ("xmm3","xmm3");
|
||||||
|
&pxor ("xmm4","xmm4");
|
||||||
|
&pxor ("xmm5","xmm5");
|
||||||
|
&pxor ("xmm6","xmm6");
|
||||||
|
&pxor ("xmm7","xmm7");
|
||||||
&function_end("aesni_ecb_encrypt");
|
&function_end("aesni_ecb_encrypt");
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
@ -704,6 +719,15 @@ if ($PREFIX eq "aesni") {
|
|||||||
&mov ("esp",&DWP(48,"esp"));
|
&mov ("esp",&DWP(48,"esp"));
|
||||||
&mov ($out,&wparam(5));
|
&mov ($out,&wparam(5));
|
||||||
&movups (&QWP(0,$out),$cmac);
|
&movups (&QWP(0,$out),$cmac);
|
||||||
|
|
||||||
|
&pxor ("xmm0","xmm0"); # clear register bank
|
||||||
|
&pxor ("xmm1","xmm1");
|
||||||
|
&pxor ("xmm2","xmm2");
|
||||||
|
&pxor ("xmm3","xmm3");
|
||||||
|
&pxor ("xmm4","xmm4");
|
||||||
|
&pxor ("xmm5","xmm5");
|
||||||
|
&pxor ("xmm6","xmm6");
|
||||||
|
&pxor ("xmm7","xmm7");
|
||||||
&function_end("aesni_ccm64_encrypt_blocks");
|
&function_end("aesni_ccm64_encrypt_blocks");
|
||||||
|
|
||||||
&function_begin("aesni_ccm64_decrypt_blocks");
|
&function_begin("aesni_ccm64_decrypt_blocks");
|
||||||
@ -804,6 +828,15 @@ if ($PREFIX eq "aesni") {
|
|||||||
&mov ("esp",&DWP(48,"esp"));
|
&mov ("esp",&DWP(48,"esp"));
|
||||||
&mov ($out,&wparam(5));
|
&mov ($out,&wparam(5));
|
||||||
&movups (&QWP(0,$out),$cmac);
|
&movups (&QWP(0,$out),$cmac);
|
||||||
|
|
||||||
|
&pxor ("xmm0","xmm0"); # clear register bank
|
||||||
|
&pxor ("xmm1","xmm1");
|
||||||
|
&pxor ("xmm2","xmm2");
|
||||||
|
&pxor ("xmm3","xmm3");
|
||||||
|
&pxor ("xmm4","xmm4");
|
||||||
|
&pxor ("xmm5","xmm5");
|
||||||
|
&pxor ("xmm6","xmm6");
|
||||||
|
&pxor ("xmm7","xmm7");
|
||||||
&function_end("aesni_ccm64_decrypt_blocks");
|
&function_end("aesni_ccm64_decrypt_blocks");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1053,6 +1086,17 @@ if ($PREFIX eq "aesni") {
|
|||||||
&movups (&QWP(0x30,$out),$inout3);
|
&movups (&QWP(0x30,$out),$inout3);
|
||||||
|
|
||||||
&set_label("ctr32_ret");
|
&set_label("ctr32_ret");
|
||||||
|
&pxor ("xmm0","xmm0"); # clear register bank
|
||||||
|
&pxor ("xmm1","xmm1");
|
||||||
|
&pxor ("xmm2","xmm2");
|
||||||
|
&pxor ("xmm3","xmm3");
|
||||||
|
&pxor ("xmm4","xmm4");
|
||||||
|
&movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
|
||||||
|
&pxor ("xmm5","xmm5");
|
||||||
|
&movdqa (&QWP(48,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm6","xmm6");
|
||||||
|
&movdqa (&QWP(64,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm7","xmm7");
|
||||||
&mov ("esp",&DWP(80,"esp"));
|
&mov ("esp",&DWP(80,"esp"));
|
||||||
&function_end("aesni_ctr32_encrypt_blocks");
|
&function_end("aesni_ctr32_encrypt_blocks");
|
||||||
|
|
||||||
@ -1394,6 +1438,20 @@ if ($PREFIX eq "aesni") {
|
|||||||
&movups (&QWP(-16,$out),$inout0); # write output
|
&movups (&QWP(-16,$out),$inout0); # write output
|
||||||
|
|
||||||
&set_label("xts_enc_ret");
|
&set_label("xts_enc_ret");
|
||||||
|
&pxor ("xmm0","xmm0"); # clear register bank
|
||||||
|
&pxor ("xmm1","xmm1");
|
||||||
|
&pxor ("xmm2","xmm2");
|
||||||
|
&movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
|
||||||
|
&pxor ("xmm3","xmm3");
|
||||||
|
&movdqa (&QWP(16*1,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm4","xmm4");
|
||||||
|
&movdqa (&QWP(16*2,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm5","xmm5");
|
||||||
|
&movdqa (&QWP(16*3,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm6","xmm6");
|
||||||
|
&movdqa (&QWP(16*4,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm7","xmm7");
|
||||||
|
&movdqa (&QWP(16*5,"esp"),"xmm0");
|
||||||
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
|
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
|
||||||
&function_end("aesni_xts_encrypt");
|
&function_end("aesni_xts_encrypt");
|
||||||
|
|
||||||
@ -1756,6 +1814,20 @@ if ($PREFIX eq "aesni") {
|
|||||||
&movups (&QWP(0,$out),$inout0); # write output
|
&movups (&QWP(0,$out),$inout0); # write output
|
||||||
|
|
||||||
&set_label("xts_dec_ret");
|
&set_label("xts_dec_ret");
|
||||||
|
&pxor ("xmm0","xmm0"); # clear register bank
|
||||||
|
&pxor ("xmm1","xmm1");
|
||||||
|
&pxor ("xmm2","xmm2");
|
||||||
|
&movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
|
||||||
|
&pxor ("xmm3","xmm3");
|
||||||
|
&movdqa (&QWP(16*1,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm4","xmm4");
|
||||||
|
&movdqa (&QWP(16*2,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm5","xmm5");
|
||||||
|
&movdqa (&QWP(16*3,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm6","xmm6");
|
||||||
|
&movdqa (&QWP(16*4,"esp"),"xmm0");
|
||||||
|
&pxor ("xmm7","xmm7");
|
||||||
|
&movdqa (&QWP(16*5,"esp"),"xmm0");
|
||||||
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
|
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
|
||||||
&function_end("aesni_xts_decrypt");
|
&function_end("aesni_xts_decrypt");
|
||||||
}
|
}
|
||||||
@ -1808,6 +1880,7 @@ if ($PREFIX eq "aesni") {
|
|||||||
&add ($len,16);
|
&add ($len,16);
|
||||||
&jnz (&label("cbc_enc_tail"));
|
&jnz (&label("cbc_enc_tail"));
|
||||||
&movaps ($ivec,$inout0);
|
&movaps ($ivec,$inout0);
|
||||||
|
&pxor ($inout0,$inout0);
|
||||||
&jmp (&label("cbc_ret"));
|
&jmp (&label("cbc_ret"));
|
||||||
|
|
||||||
&set_label("cbc_enc_tail");
|
&set_label("cbc_enc_tail");
|
||||||
@ -1871,7 +1944,7 @@ if ($PREFIX eq "aesni") {
|
|||||||
&movaps ($inout0,$inout5);
|
&movaps ($inout0,$inout5);
|
||||||
&movaps ($ivec,$rndkey0);
|
&movaps ($ivec,$rndkey0);
|
||||||
&add ($len,0x50);
|
&add ($len,0x50);
|
||||||
&jle (&label("cbc_dec_tail_collected"));
|
&jle (&label("cbc_dec_clear_tail_collected"));
|
||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
&lea ($out,&DWP(0x10,$out));
|
&lea ($out,&DWP(0x10,$out));
|
||||||
&set_label("cbc_dec_tail");
|
&set_label("cbc_dec_tail");
|
||||||
@ -1910,10 +1983,14 @@ if ($PREFIX eq "aesni") {
|
|||||||
&xorps ($inout4,$rndkey0);
|
&xorps ($inout4,$rndkey0);
|
||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
&movups (&QWP(0x10,$out),$inout1);
|
&movups (&QWP(0x10,$out),$inout1);
|
||||||
|
&pxor ($inout1,$inout1);
|
||||||
&movups (&QWP(0x20,$out),$inout2);
|
&movups (&QWP(0x20,$out),$inout2);
|
||||||
|
&pxor ($inout2,$inout2);
|
||||||
&movups (&QWP(0x30,$out),$inout3);
|
&movups (&QWP(0x30,$out),$inout3);
|
||||||
|
&pxor ($inout3,$inout3);
|
||||||
&lea ($out,&DWP(0x40,$out));
|
&lea ($out,&DWP(0x40,$out));
|
||||||
&movaps ($inout0,$inout4);
|
&movaps ($inout0,$inout4);
|
||||||
|
&pxor ($inout4,$inout4);
|
||||||
&sub ($len,0x50);
|
&sub ($len,0x50);
|
||||||
&jmp (&label("cbc_dec_tail_collected"));
|
&jmp (&label("cbc_dec_tail_collected"));
|
||||||
|
|
||||||
@ -1933,6 +2010,7 @@ if ($PREFIX eq "aesni") {
|
|||||||
&xorps ($inout1,$in0);
|
&xorps ($inout1,$in0);
|
||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
&movaps ($inout0,$inout1);
|
&movaps ($inout0,$inout1);
|
||||||
|
&pxor ($inout1,$inout1);
|
||||||
&lea ($out,&DWP(0x10,$out));
|
&lea ($out,&DWP(0x10,$out));
|
||||||
&movaps ($ivec,$in1);
|
&movaps ($ivec,$in1);
|
||||||
&sub ($len,0x20);
|
&sub ($len,0x20);
|
||||||
@ -1945,7 +2023,9 @@ if ($PREFIX eq "aesni") {
|
|||||||
&xorps ($inout2,$in1);
|
&xorps ($inout2,$in1);
|
||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
&movaps ($inout0,$inout2);
|
&movaps ($inout0,$inout2);
|
||||||
|
&pxor ($inout2,$inout2);
|
||||||
&movups (&QWP(0x10,$out),$inout1);
|
&movups (&QWP(0x10,$out),$inout1);
|
||||||
|
&pxor ($inout1,$inout1);
|
||||||
&lea ($out,&DWP(0x20,$out));
|
&lea ($out,&DWP(0x20,$out));
|
||||||
&movups ($ivec,&QWP(0x20,$inp));
|
&movups ($ivec,&QWP(0x20,$inp));
|
||||||
&sub ($len,0x30);
|
&sub ($len,0x30);
|
||||||
@ -1961,29 +2041,44 @@ if ($PREFIX eq "aesni") {
|
|||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
&xorps ($inout2,$rndkey1);
|
&xorps ($inout2,$rndkey1);
|
||||||
&movups (&QWP(0x10,$out),$inout1);
|
&movups (&QWP(0x10,$out),$inout1);
|
||||||
|
&pxor ($inout1,$inout1);
|
||||||
&xorps ($inout3,$rndkey0);
|
&xorps ($inout3,$rndkey0);
|
||||||
&movups (&QWP(0x20,$out),$inout2);
|
&movups (&QWP(0x20,$out),$inout2);
|
||||||
|
&pxor ($inout2,$inout2);
|
||||||
&lea ($out,&DWP(0x30,$out));
|
&lea ($out,&DWP(0x30,$out));
|
||||||
&movaps ($inout0,$inout3);
|
&movaps ($inout0,$inout3);
|
||||||
|
&pxor ($inout3,$inout3);
|
||||||
&sub ($len,0x40);
|
&sub ($len,0x40);
|
||||||
|
&jmp (&label("cbc_dec_tail_collected"));
|
||||||
|
|
||||||
|
&set_label("cbc_dec_clear_tail_collected",16);
|
||||||
|
&pxor ($inout1,$inout1);
|
||||||
|
&pxor ($inout2,$inout2);
|
||||||
|
&pxor ($inout3,$inout3);
|
||||||
|
&pxor ($inout4,$inout4);
|
||||||
&set_label("cbc_dec_tail_collected");
|
&set_label("cbc_dec_tail_collected");
|
||||||
&and ($len,15);
|
&and ($len,15);
|
||||||
&jnz (&label("cbc_dec_tail_partial"));
|
&jnz (&label("cbc_dec_tail_partial"));
|
||||||
&movups (&QWP(0,$out),$inout0);
|
&movups (&QWP(0,$out),$inout0);
|
||||||
|
&pxor ($rndkey0,$rndkey0);
|
||||||
&jmp (&label("cbc_ret"));
|
&jmp (&label("cbc_ret"));
|
||||||
|
|
||||||
&set_label("cbc_dec_tail_partial",16);
|
&set_label("cbc_dec_tail_partial",16);
|
||||||
&movaps (&QWP(0,"esp"),$inout0);
|
&movaps (&QWP(0,"esp"),$inout0);
|
||||||
|
&pxor ($rndkey0,$rndkey0);
|
||||||
&mov ("ecx",16);
|
&mov ("ecx",16);
|
||||||
&mov ($inp,"esp");
|
&mov ($inp,"esp");
|
||||||
&sub ("ecx",$len);
|
&sub ("ecx",$len);
|
||||||
&data_word(0xA4F3F689); # rep movsb
|
&data_word(0xA4F3F689); # rep movsb
|
||||||
|
&movdqa (&QWP(0,"esp"),$inout0);
|
||||||
|
|
||||||
&set_label("cbc_ret");
|
&set_label("cbc_ret");
|
||||||
&mov ("esp",&DWP(16,"esp")); # pull original %esp
|
&mov ("esp",&DWP(16,"esp")); # pull original %esp
|
||||||
&mov ($key_,&wparam(4));
|
&mov ($key_,&wparam(4));
|
||||||
|
&pxor ($inout0,$inout0);
|
||||||
|
&pxor ($rndkey1,$rndkey1);
|
||||||
&movups (&QWP(0,$key_),$ivec); # output IV
|
&movups (&QWP(0,$key_),$ivec); # output IV
|
||||||
|
&pxor ($ivec,$ivec);
|
||||||
&set_label("cbc_abort");
|
&set_label("cbc_abort");
|
||||||
&function_end("${PREFIX}_cbc_encrypt");
|
&function_end("${PREFIX}_cbc_encrypt");
|
||||||
|
|
||||||
@ -2000,14 +2095,24 @@ if ($PREFIX eq "aesni") {
|
|||||||
# $round rounds
|
# $round rounds
|
||||||
|
|
||||||
&function_begin_B("_aesni_set_encrypt_key");
|
&function_begin_B("_aesni_set_encrypt_key");
|
||||||
|
&push ("ebp");
|
||||||
|
&push ("ebx");
|
||||||
&test ("eax","eax");
|
&test ("eax","eax");
|
||||||
&jz (&label("bad_pointer"));
|
&jz (&label("bad_pointer"));
|
||||||
&test ($key,$key);
|
&test ($key,$key);
|
||||||
&jz (&label("bad_pointer"));
|
&jz (&label("bad_pointer"));
|
||||||
|
|
||||||
|
&call (&label("pic"));
|
||||||
|
&set_label("pic");
|
||||||
|
&blindpop("ebx");
|
||||||
|
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
|
||||||
|
|
||||||
|
&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
|
||||||
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
|
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
|
||||||
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
|
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
|
||||||
|
&mov ("ebp",&DWP(4,"ebp"));
|
||||||
&lea ($key,&DWP(16,$key));
|
&lea ($key,&DWP(16,$key));
|
||||||
|
&and ("ebp",1<<28|1<<11); # AVX and XOP bits
|
||||||
&cmp ($rounds,256);
|
&cmp ($rounds,256);
|
||||||
&je (&label("14rounds"));
|
&je (&label("14rounds"));
|
||||||
&cmp ($rounds,192);
|
&cmp ($rounds,192);
|
||||||
@ -2016,6 +2121,9 @@ if ($PREFIX eq "aesni") {
|
|||||||
&jne (&label("bad_keybits"));
|
&jne (&label("bad_keybits"));
|
||||||
|
|
||||||
&set_label("10rounds",16);
|
&set_label("10rounds",16);
|
||||||
|
&cmp ("ebp",1<<28);
|
||||||
|
&je (&label("10rounds_alt"));
|
||||||
|
|
||||||
&mov ($rounds,9);
|
&mov ($rounds,9);
|
||||||
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
|
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
|
||||||
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
|
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
|
||||||
@ -2040,8 +2148,8 @@ if ($PREFIX eq "aesni") {
|
|||||||
&call (&label("key_128"));
|
&call (&label("key_128"));
|
||||||
&$movekey (&QWP(0,$key),"xmm0");
|
&$movekey (&QWP(0,$key),"xmm0");
|
||||||
&mov (&DWP(80,$key),$rounds);
|
&mov (&DWP(80,$key),$rounds);
|
||||||
&xor ("eax","eax");
|
|
||||||
&ret();
|
&jmp (&label("good_key"));
|
||||||
|
|
||||||
&set_label("key_128",16);
|
&set_label("key_128",16);
|
||||||
&$movekey (&QWP(0,$key),"xmm0");
|
&$movekey (&QWP(0,$key),"xmm0");
|
||||||
@ -2055,8 +2163,76 @@ if ($PREFIX eq "aesni") {
|
|||||||
&xorps ("xmm0","xmm1");
|
&xorps ("xmm0","xmm1");
|
||||||
&ret();
|
&ret();
|
||||||
|
|
||||||
|
&set_label("10rounds_alt",16);
|
||||||
|
&movdqa ("xmm5",&QWP(0x00,"ebx"));
|
||||||
|
&mov ($rounds,8);
|
||||||
|
&movdqa ("xmm4",&QWP(0x20,"ebx"));
|
||||||
|
&movdqa ("xmm2","xmm0");
|
||||||
|
&movdqu (&QWP(-16,$key),"xmm0"); # round 0: 128-bit store, so QWP like the other key stores
|
||||||
|
|
||||||
|
&set_label("loop_key128");
|
||||||
|
&pshufb ("xmm0","xmm5");
|
||||||
|
&aesenclast ("xmm0","xmm4");
|
||||||
|
&pslld ("xmm4",1);
|
||||||
|
&lea ($key,&DWP(16,$key));
|
||||||
|
|
||||||
|
&movdqa ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm2","xmm3");
|
||||||
|
|
||||||
|
&pxor ("xmm0","xmm2");
|
||||||
|
&movdqu (&QWP(-16,$key),"xmm0");
|
||||||
|
&movdqa ("xmm2","xmm0");
|
||||||
|
|
||||||
|
&dec ($rounds);
|
||||||
|
&jnz (&label("loop_key128"));
|
||||||
|
|
||||||
|
&movdqa ("xmm4",&QWP(0x30,"ebx"));
|
||||||
|
|
||||||
|
&pshufb ("xmm0","xmm5");
|
||||||
|
&aesenclast ("xmm0","xmm4");
|
||||||
|
&pslld ("xmm4",1);
|
||||||
|
|
||||||
|
&movdqa ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm2","xmm3");
|
||||||
|
|
||||||
|
&pxor ("xmm0","xmm2");
|
||||||
|
&movdqu (&QWP(0,$key),"xmm0");
|
||||||
|
|
||||||
|
&movdqa ("xmm2","xmm0");
|
||||||
|
&pshufb ("xmm0","xmm5");
|
||||||
|
&aesenclast ("xmm0","xmm4");
|
||||||
|
|
||||||
|
&movdqa ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm3","xmm2");
|
||||||
|
&pslldq ("xmm2",4);
|
||||||
|
&pxor ("xmm2","xmm3");
|
||||||
|
|
||||||
|
&pxor ("xmm0","xmm2");
|
||||||
|
&movdqu (&QWP(16,$key),"xmm0");
|
||||||
|
|
||||||
|
&mov ($rounds,9);
|
||||||
|
&mov (&DWP(96,$key),$rounds);
|
||||||
|
|
||||||
|
&jmp (&label("good_key"));
|
||||||
|
|
||||||
&set_label("12rounds",16);
|
&set_label("12rounds",16);
|
||||||
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
|
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
|
||||||
|
&cmp ("ebp",1<<28);
|
||||||
|
&je (&label("12rounds_alt"));
|
||||||
|
|
||||||
&mov ($rounds,11);
|
&mov ($rounds,11);
|
||||||
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
|
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
|
||||||
&aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
|
&aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
|
||||||
@ -2077,8 +2253,8 @@ if ($PREFIX eq "aesni") {
|
|||||||
&call (&label("key_192b"));
|
&call (&label("key_192b"));
|
||||||
&$movekey (&QWP(0,$key),"xmm0");
|
&$movekey (&QWP(0,$key),"xmm0");
|
||||||
&mov (&DWP(48,$key),$rounds);
|
&mov (&DWP(48,$key),$rounds);
|
||||||
&xor ("eax","eax");
|
|
||||||
&ret();
|
&jmp (&label("good_key"));
|
||||||
|
|
||||||
&set_label("key_192a",16);
|
&set_label("key_192a",16);
|
||||||
&$movekey (&QWP(0,$key),"xmm0");
|
&$movekey (&QWP(0,$key),"xmm0");
|
||||||
@ -2108,10 +2284,52 @@ if ($PREFIX eq "aesni") {
|
|||||||
&lea ($key,&DWP(32,$key));
|
&lea ($key,&DWP(32,$key));
|
||||||
&jmp (&label("key_192b_warm"));
|
&jmp (&label("key_192b_warm"));
|
||||||
|
|
||||||
|
&set_label("12rounds_alt",16);
|
||||||
|
&movdqa ("xmm5",&QWP(0x10,"ebx"));
|
||||||
|
&movdqa ("xmm4",&QWP(0x20,"ebx"));
|
||||||
|
&mov ($rounds,8);
|
||||||
|
&movdqu (&QWP(-16,$key),"xmm0");
|
||||||
|
|
||||||
|
&set_label("loop_key192");
|
||||||
|
&movq (&QWP(0,$key),"xmm2");
|
||||||
|
&movdqa ("xmm1","xmm2");
|
||||||
|
&pshufb ("xmm2","xmm5");
|
||||||
|
&aesenclast ("xmm2","xmm4");
|
||||||
|
&pslld ("xmm4",1);
|
||||||
|
&lea ($key,&DWP(24,$key));
|
||||||
|
|
||||||
|
&movdqa ("xmm3","xmm0");
|
||||||
|
&pslldq ("xmm0",4);
|
||||||
|
&pxor ("xmm3","xmm0");
|
||||||
|
&pslldq ("xmm0",4);
|
||||||
|
&pxor ("xmm3","xmm0");
|
||||||
|
&pslldq ("xmm0",4);
|
||||||
|
&pxor ("xmm0","xmm3");
|
||||||
|
|
||||||
|
&pshufd ("xmm3","xmm0",0xff);
|
||||||
|
&pxor ("xmm3","xmm1");
|
||||||
|
&pslldq ("xmm1",4);
|
||||||
|
&pxor ("xmm3","xmm1");
|
||||||
|
|
||||||
|
&pxor ("xmm0","xmm2");
|
||||||
|
&pxor ("xmm2","xmm3");
|
||||||
|
&movdqu (&QWP(-16,$key),"xmm0");
|
||||||
|
|
||||||
|
&dec ($rounds);
|
||||||
|
&jnz (&label("loop_key192"));
|
||||||
|
|
||||||
|
&mov ($rounds,11);
|
||||||
|
&mov (&DWP(32,$key),$rounds);
|
||||||
|
|
||||||
|
&jmp (&label("good_key"));
|
||||||
|
|
||||||
&set_label("14rounds",16);
|
&set_label("14rounds",16);
|
||||||
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
|
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
|
||||||
&mov ($rounds,13);
|
|
||||||
&lea ($key,&DWP(16,$key));
|
&lea ($key,&DWP(16,$key));
|
||||||
|
&cmp ("ebp",1<<28);
|
||||||
|
&je (&label("14rounds_alt"));
|
||||||
|
|
||||||
|
&mov ($rounds,13);
|
||||||
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
|
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
|
||||||
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
|
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
|
||||||
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
|
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
|
||||||
@ -2143,7 +2361,8 @@ if ($PREFIX eq "aesni") {
|
|||||||
&$movekey (&QWP(0,$key),"xmm0");
|
&$movekey (&QWP(0,$key),"xmm0");
|
||||||
&mov (&DWP(16,$key),$rounds);
|
&mov (&DWP(16,$key),$rounds);
|
||||||
&xor ("eax","eax");
|
&xor ("eax","eax");
|
||||||
&ret();
|
|
||||||
|
&jmp (&label("good_key"));
|
||||||
|
|
||||||
&set_label("key_256a",16);
|
&set_label("key_256a",16);
|
||||||
&$movekey (&QWP(0,$key),"xmm2");
|
&$movekey (&QWP(0,$key),"xmm2");
|
||||||
@ -2169,11 +2388,77 @@ if ($PREFIX eq "aesni") {
|
|||||||
&xorps ("xmm2","xmm1");
|
&xorps ("xmm2","xmm1");
|
||||||
&ret();
|
&ret();
|
||||||
|
|
||||||
|
&set_label("14rounds_alt",16);
|
||||||
|
&movdqa ("xmm5",&QWP(0x00,"ebx"));
|
||||||
|
&movdqa ("xmm4",&QWP(0x20,"ebx"));
|
||||||
|
&mov ($rounds,7);
|
||||||
|
&movdqu (&QWP(-32,$key),"xmm0");
|
||||||
|
&movdqa ("xmm1","xmm2");
|
||||||
|
&movdqu (&QWP(-16,$key),"xmm2");
|
||||||
|
|
||||||
|
&set_label("loop_key256");
|
||||||
|
&pshufb ("xmm2","xmm5");
|
||||||
|
&aesenclast ("xmm2","xmm4");
|
||||||
|
|
||||||
|
&movdqa ("xmm3","xmm0");
|
||||||
|
&pslldq ("xmm0",4);
|
||||||
|
&pxor ("xmm3","xmm0");
|
||||||
|
&pslldq ("xmm0",4);
|
||||||
|
&pxor ("xmm3","xmm0");
|
||||||
|
&pslldq ("xmm0",4);
|
||||||
|
&pxor ("xmm0","xmm3");
|
||||||
|
&pslld ("xmm4",1);
|
||||||
|
|
||||||
|
&pxor ("xmm0","xmm2");
|
||||||
|
&movdqu (&QWP(0,$key),"xmm0");
|
||||||
|
|
||||||
|
&dec ($rounds);
|
||||||
|
&jz (&label("done_key256"));
|
||||||
|
|
||||||
|
&pshufd ("xmm2","xmm0",0xff);
|
||||||
|
&pxor ("xmm3","xmm3");
|
||||||
|
&aesenclast ("xmm2","xmm3");
|
||||||
|
|
||||||
|
&movdqa ("xmm3","xmm1"); # keep a copy of xmm1 before it is shifted below
|
||||||
|
&pslldq ("xmm1",4);
|
||||||
|
&pxor ("xmm3","xmm1");
|
||||||
|
&pslldq ("xmm1",4);
|
||||||
|
&pxor ("xmm3","xmm1");
|
||||||
|
&pslldq ("xmm1",4);
|
||||||
|
&pxor ("xmm1","xmm3");
|
||||||
|
|
||||||
|
&pxor ("xmm2","xmm1");
|
||||||
|
&movdqu (&QWP(16,$key),"xmm2");
|
||||||
|
&lea ($key,&DWP(32,$key));
|
||||||
|
&movdqa ("xmm1","xmm2");
|
||||||
|
&jmp (&label("loop_key256"));
|
||||||
|
|
||||||
|
&set_label("done_key256");
|
||||||
|
&mov ($rounds,13);
|
||||||
|
&mov (&DWP(16,$key),$rounds);
|
||||||
|
|
||||||
|
&set_label("good_key");
|
||||||
|
&pxor ("xmm0","xmm0");
|
||||||
|
&pxor ("xmm1","xmm1");
|
||||||
|
&pxor ("xmm2","xmm2");
|
||||||
|
&pxor ("xmm3","xmm3");
|
||||||
|
&pxor ("xmm4","xmm4");
|
||||||
|
&pxor ("xmm5","xmm5");
|
||||||
|
&xor ("eax","eax");
|
||||||
|
&pop ("ebx");
|
||||||
|
&pop ("ebp");
|
||||||
|
&ret ();
|
||||||
|
|
||||||
&set_label("bad_pointer",4);
|
&set_label("bad_pointer",4);
|
||||||
&mov ("eax",-1);
|
&mov ("eax",-1);
|
||||||
|
&pop ("ebx");
|
||||||
|
&pop ("ebp");
|
||||||
&ret ();
|
&ret ();
|
||||||
&set_label("bad_keybits",4);
|
&set_label("bad_keybits",4);
|
||||||
|
&pxor ("xmm0","xmm0");
|
||||||
&mov ("eax",-2);
|
&mov ("eax",-2);
|
||||||
|
&pop ("ebx");
|
||||||
|
&pop ("ebp");
|
||||||
&ret ();
|
&ret ();
|
||||||
&function_end_B("_aesni_set_encrypt_key");
|
&function_end_B("_aesni_set_encrypt_key");
|
||||||
|
|
||||||
@ -2223,10 +2508,18 @@ if ($PREFIX eq "aesni") {
|
|||||||
&aesimc ("xmm0","xmm0");
|
&aesimc ("xmm0","xmm0");
|
||||||
&$movekey (&QWP(0,$key),"xmm0");
|
&$movekey (&QWP(0,$key),"xmm0");
|
||||||
|
|
||||||
|
&pxor ("xmm0","xmm0");
|
||||||
|
&pxor ("xmm1","xmm1");
|
||||||
&xor ("eax","eax"); # return success
|
&xor ("eax","eax"); # return success
|
||||||
&set_label("dec_key_ret");
|
&set_label("dec_key_ret");
|
||||||
&ret ();
|
&ret ();
|
||||||
&function_end_B("${PREFIX}_set_decrypt_key");
|
&function_end_B("${PREFIX}_set_decrypt_key");
|
||||||
|
|
||||||
|
&set_label("key_const",64);
|
||||||
|
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
|
||||||
|
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
|
||||||
|
&data_word(1,1,1,1);
|
||||||
|
&data_word(0x1b,0x1b,0x1b,0x1b);
|
||||||
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
|
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
|
||||||
|
|
||||||
&asm_finish();
|
&asm_finish();
|
||||||
|
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user