aes/asm/aesni-x86[_64].pl update.

This addresses

- request for improvement for faster key setup in RT#3576;
- clearing registers and stack in RT#3554 (this is more of a gesture to
see if there will be some traction from compiler side);
- more commentary around input parameters handling and stack layout
(desired when RT#3553 was reviewed);
- minor size and single block performance optimization (was lying around);

Reviewed-by: Matt Caswell <matt@openssl.org>
This commit is contained in:
Andy Polyakov 2015-01-23 17:27:10 +01:00
parent 313e6ec11f
commit 23f6eec71d
2 changed files with 1027 additions and 241 deletions

View File

@ -51,7 +51,7 @@
# Westmere 3.77/1.37 1.37 1.52 1.27
# * Bridge 5.07/0.98 0.99 1.09 0.91
# Haswell 4.44/0.80 0.97 1.03 0.72
# Atom 5.77/3.56 3.67 4.03 3.46
# Silvermont 5.77/3.56 3.67 4.03 3.46
# Bulldozer 5.80/0.98 1.05 1.24 0.93
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
@ -65,6 +65,9 @@ require "x86asm.pl";
&asm_init($ARGV[0],$0);
&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");
if ($PREFIX eq "aesni") { $movekey=\&movups; }
else { $movekey=\&movups; }
@ -181,7 +184,10 @@ sub aesni_generate1 # fully unrolled loop
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&pxor ($rndkey0,$rndkey0); # clear register bank
&pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
&pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_encrypt");
@ -197,7 +203,10 @@ sub aesni_generate1 # fully unrolled loop
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&pxor ($rndkey0,$rndkey0); # clear register bank
&pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
&pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_decrypt");
@ -349,17 +358,15 @@ sub aesni_generate6
&neg ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout5,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key,$rounds));
&add ($rounds,16);
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jmp (&label("_aesni_${p}rypt6_enter"));
&jmp (&label("_aesni_${p}rypt6_inner"));
&set_label("${p}6_loop",16);
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&set_label("_aesni_${p}rypt6_inner");
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
@ -615,6 +622,14 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x30,$out),$inout3);
&set_label("ecb_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");
######################################################################
@ -704,6 +719,15 @@ if ($PREFIX eq "aesni") {
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movups (&QWP(0,$out),$cmac);
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");
&function_begin("aesni_ccm64_decrypt_blocks");
@ -804,6 +828,15 @@ if ($PREFIX eq "aesni") {
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movups (&QWP(0,$out),$cmac);
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}
@ -1053,6 +1086,17 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0x30,$out),$inout3);
&set_label("ctr32_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
&pxor ("xmm5","xmm5");
&movdqa (&QWP(48,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(64,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&mov ("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");
@ -1394,6 +1438,20 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(-16,$out),$inout0); # write output
&set_label("xts_enc_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
&pxor ("xmm3","xmm3");
&movdqa (&QWP(16*1,"esp"),"xmm0");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(16*2,"esp"),"xmm0");
&pxor ("xmm5","xmm5");
&movdqa (&QWP(16*3,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(16*4,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&movdqa (&QWP(16*5,"esp"),"xmm0");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_encrypt");
@ -1756,6 +1814,20 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0); # write output
&set_label("xts_dec_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
&pxor ("xmm3","xmm3");
&movdqa (&QWP(16*1,"esp"),"xmm0");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(16*2,"esp"),"xmm0");
&pxor ("xmm5","xmm5");
&movdqa (&QWP(16*3,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(16*4,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&movdqa (&QWP(16*5,"esp"),"xmm0");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
@ -1808,6 +1880,7 @@ if ($PREFIX eq "aesni") {
&add ($len,16);
&jnz (&label("cbc_enc_tail"));
&movaps ($ivec,$inout0);
&pxor ($inout0,$inout0);
&jmp (&label("cbc_ret"));
&set_label("cbc_enc_tail");
@ -1871,7 +1944,7 @@ if ($PREFIX eq "aesni") {
&movaps ($inout0,$inout5);
&movaps ($ivec,$rndkey0);
&add ($len,0x50);
&jle (&label("cbc_dec_tail_collected"));
&jle (&label("cbc_dec_clear_tail_collected"));
&movups (&QWP(0,$out),$inout0);
&lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
@ -1910,10 +1983,14 @@ if ($PREFIX eq "aesni") {
&xorps ($inout4,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&pxor ($inout1,$inout1);
&movups (&QWP(0x20,$out),$inout2);
&pxor ($inout2,$inout2);
&movups (&QWP(0x30,$out),$inout3);
&pxor ($inout3,$inout3);
&lea ($out,&DWP(0x40,$out));
&movaps ($inout0,$inout4);
&pxor ($inout4,$inout4);
&sub ($len,0x50);
&jmp (&label("cbc_dec_tail_collected"));
@ -1933,6 +2010,7 @@ if ($PREFIX eq "aesni") {
&xorps ($inout1,$in0);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout1);
&pxor ($inout1,$inout1);
&lea ($out,&DWP(0x10,$out));
&movaps ($ivec,$in1);
&sub ($len,0x20);
@ -1945,7 +2023,9 @@ if ($PREFIX eq "aesni") {
&xorps ($inout2,$in1);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout2);
&pxor ($inout2,$inout2);
&movups (&QWP(0x10,$out),$inout1);
&pxor ($inout1,$inout1);
&lea ($out,&DWP(0x20,$out));
&movups ($ivec,&QWP(0x20,$inp));
&sub ($len,0x30);
@ -1961,29 +2041,44 @@ if ($PREFIX eq "aesni") {
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&pxor ($inout1,$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
&pxor ($inout2,$inout2);
&lea ($out,&DWP(0x30,$out));
&movaps ($inout0,$inout3);
&pxor ($inout3,$inout3);
&sub ($len,0x40);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_clear_tail_collected",16);
&pxor ($inout1,$inout1);
&pxor ($inout2,$inout2);
&pxor ($inout3,$inout3);
&pxor ($inout4,$inout4);
&set_label("cbc_dec_tail_collected");
&and ($len,15);
&jnz (&label("cbc_dec_tail_partial"));
&movups (&QWP(0,$out),$inout0);
&pxor ($rndkey0,$rndkey0);
&jmp (&label("cbc_ret"));
&set_label("cbc_dec_tail_partial",16);
&movaps (&QWP(0,"esp"),$inout0);
&pxor ($rndkey0,$rndkey0);
&mov ("ecx",16);
&mov ($inp,"esp");
&sub ("ecx",$len);
&data_word(0xA4F3F689); # rep movsb
&movdqa (&QWP(0,"esp"),$inout0);
&set_label("cbc_ret");
&mov ("esp",&DWP(16,"esp")); # pull original %esp
&mov ($key_,&wparam(4));
&pxor ($inout0,$inout0);
&pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,$key_),$ivec); # output IV
&pxor ($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
@ -2000,14 +2095,24 @@ if ($PREFIX eq "aesni") {
# $round rounds
&function_begin_B("_aesni_set_encrypt_key");
&push ("ebp");
&push ("ebx");
&test ("eax","eax");
&jz (&label("bad_pointer"));
&test ($key,$key);
&jz (&label("bad_pointer"));
&call (&label("pic"));
&set_label("pic");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
&mov ("ebp",&DWP(4,"ebp"));
&lea ($key,&DWP(16,$key));
&and ("ebp",1<<28|1<<11); # AVX and XOP bits
&cmp ($rounds,256);
&je (&label("14rounds"));
&cmp ($rounds,192);
@ -2016,6 +2121,9 @@ if ($PREFIX eq "aesni") {
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
&cmp ("ebp",1<<28);
&je (&label("10rounds_alt"));
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
@ -2040,8 +2148,8 @@ if ($PREFIX eq "aesni") {
&call (&label("key_128"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(80,$key),$rounds);
&xor ("eax","eax");
&ret();
&jmp (&label("good_key"));
&set_label("key_128",16);
&$movekey (&QWP(0,$key),"xmm0");
@ -2055,8 +2163,76 @@ if ($PREFIX eq "aesni") {
&xorps ("xmm0","xmm1");
&ret();
&set_label("10rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&mov ($rounds,8);
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&movdqa ("xmm2","xmm0");
&movdqu (&DWP(-16,$key),"xmm0");
&set_label("loop_key128");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&lea ($key,&DWP(16,$key));
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(-16,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&dec ($rounds);
&jnz (&label("loop_key128"));
&movdqa ("xmm4",&QWP(0x30,"ebx"));
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(16,$key),"xmm0");
&mov ($rounds,9);
&mov (&DWP(96,$key),$rounds);
&jmp (&label("good_key"));
&set_label("12rounds",16);
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
&cmp ("ebp",1<<28);
&je (&label("12rounds_alt"));
&mov ($rounds,11);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
@ -2077,8 +2253,8 @@ if ($PREFIX eq "aesni") {
&call (&label("key_192b"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(48,$key),$rounds);
&xor ("eax","eax");
&ret();
&jmp (&label("good_key"));
&set_label("key_192a",16);
&$movekey (&QWP(0,$key),"xmm0");
@ -2108,10 +2284,52 @@ if ($PREFIX eq "aesni") {
&lea ($key,&DWP(32,$key));
&jmp (&label("key_192b_warm"));
&set_label("12rounds_alt",16);
&movdqa ("xmm5",&QWP(0x10,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,8);
&movdqu (&QWP(-16,$key),"xmm0");
&set_label("loop_key192");
&movq (&QWP(0,$key),"xmm2");
&movdqa ("xmm1","xmm2");
&pshufb ("xmm2","xmm5");
&aesenclast ("xmm2","xmm4");
&pslld ("xmm4",1);
&lea ($key,&DWP(24,$key));
&movdqa ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm0","xmm3");
&pshufd ("xmm3","xmm0",0xff);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pxor ("xmm0","xmm2");
&pxor ("xmm2","xmm3");
&movdqu (&QWP(-16,$key),"xmm0");
&dec ($rounds);
&jnz (&label("loop_key192"));
&mov ($rounds,11);
&mov (&DWP(32,$key),$rounds);
&jmp (&label("good_key"));
&set_label("14rounds",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
&mov ($rounds,13);
&lea ($key,&DWP(16,$key));
&cmp ("ebp",1<<28);
&je (&label("14rounds_alt"));
&mov ($rounds,13);
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
@ -2143,7 +2361,8 @@ if ($PREFIX eq "aesni") {
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(16,$key),$rounds);
&xor ("eax","eax");
&ret();
&jmp (&label("good_key"));
&set_label("key_256a",16);
&$movekey (&QWP(0,$key),"xmm2");
@ -2169,11 +2388,77 @@ if ($PREFIX eq "aesni") {
&xorps ("xmm2","xmm1");
&ret();
&set_label("14rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,7);
&movdqu (&QWP(-32,$key),"xmm0");
&movdqa ("xmm1","xmm2");
&movdqu (&QWP(-16,$key),"xmm2");
&set_label("loop_key256");
&pshufb ("xmm2","xmm5");
&aesenclast ("xmm2","xmm4");
&movdqa ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm0","xmm3");
&pslld ("xmm4",1);
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&dec ($rounds);
&jz (&label("done_key256"));
&pshufd ("xmm2","xmm0",0xff);
&pxor ("xmm3","xmm3");
&aesenclast ("xmm2","xmm3");
&movdqa ("xmm3","xmm1")
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm1","xmm3");
&pxor ("xmm2","xmm1");
&movdqu (&QWP(16,$key),"xmm2");
&lea ($key,&DWP(32,$key));
&movdqa ("xmm1","xmm2");
&jmp (&label("loop_key256"));
&set_label("done_key256");
&mov ($rounds,13);
&mov (&DWP(16,$key),$rounds);
&set_label("good_key");
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&xor ("eax","eax");
&pop ("ebx");
&pop ("ebp");
&ret ();
&set_label("bad_pointer",4);
&mov ("eax",-1);
&pop ("ebx");
&pop ("ebp");
&ret ();
&set_label("bad_keybits",4);
&pxor ("xmm0","xmm0");
&mov ("eax",-2);
&pop ("ebx");
&pop ("ebp");
&ret ();
&function_end_B("_aesni_set_encrypt_key");
@ -2223,10 +2508,18 @@ if ($PREFIX eq "aesni") {
&aesimc ("xmm0","xmm0");
&$movekey (&QWP(0,$key),"xmm0");
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&xor ("eax","eax"); # return success
&set_label("dec_key_ret");
&ret ();
&function_end_B("${PREFIX}_set_decrypt_key");
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();

File diff suppressed because it is too large Load Diff