Avoid aliasing between stack frames and S-boxes. Compress prefetch code.

This commit is contained in:
Andy Polyakov 2005-04-22 11:49:32 +00:00
parent 00df894701
commit 04d0d0accf

View File

@ -6,7 +6,7 @@
# forms are granted according to the OpenSSL license. # forms are granted according to the OpenSSL license.
# ==================================================================== # ====================================================================
# #
# Version 3.2. # Version 3.3.
# #
# You might fail to appreciate this module performance from the first # You might fail to appreciate this module performance from the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
@ -104,9 +104,9 @@ sub encvert()
my $v0 = $acc, $v1 = $key; my $v0 = $acc, $v1 = $key;
&mov ($v0,$s[3]); # copy s3 &mov ($v0,$s[3]); # copy s3
&mov (&DWP(0,"esp"),$s[2]); # save s2 &mov (&DWP(4,"esp"),$s[2]); # save s2
&mov ($v1,$s[0]); # copy s0 &mov ($v1,$s[0]); # copy s0
&mov (&DWP(4,"esp"),$s[1]); # save s1 &mov (&DWP(8,"esp"),$s[1]); # save s1
&movz ($s[2],&HB($s[0])); &movz ($s[2],&HB($s[0]));
&and ($s[0],0xFF); &and ($s[0],0xFF);
@ -127,7 +127,7 @@ sub encvert()
&movz ($v0,&HB($v1)); &movz ($v0,&HB($v1));
&and ($v1,0xFF); &and ($v1,0xFF);
&xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16 &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
&mov ($v1,&DWP(0,"esp")); # restore s2 &mov ($v1,&DWP(4,"esp")); # restore s2
&xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24 &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
&mov ($v0,$v1); &mov ($v0,$v1);
@ -139,7 +139,7 @@ sub encvert()
&movz ($v1,&HB($v0)); &movz ($v1,&HB($v0));
&and ($v0,0xFF); &and ($v0,0xFF);
&xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16 &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
&mov ($v0,&DWP(4,"esp")); # restore s1 &mov ($v0,&DWP(8,"esp")); # restore s1
&xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24 &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
&mov ($v1,$v0); &mov ($v1,$v0);
@ -172,19 +172,19 @@ sub encstep()
&movz ($tmp,&HB($s[1])); &movz ($tmp,&HB($s[1]));
&xor ($out,&DWP(3,$te,$tmp,8)); &xor ($out,&DWP(3,$te,$tmp,8));
if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }##%ebx if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
else { &mov ($tmp,$s[2]); else { &mov ($tmp,$s[2]);
&shr ($tmp,16); } &shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2] if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
&and ($tmp,0xFF); &and ($tmp,0xFF);
&xor ($out,&DWP(2,$te,$tmp,8)); &xor ($out,&DWP(2,$te,$tmp,8));
if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }##%ecx if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]); else { &mov ($tmp,$s[3]);
&shr ($tmp,24) } &shr ($tmp,24) }
&xor ($out,&DWP(1,$te,$tmp,8)); &xor ($out,&DWP(1,$te,$tmp,8));
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); } if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
if ($i==3) { &mov ($s[3],$acc); } if ($i==3) { &mov ($s[3],$acc); }
&comment(); &comment();
} }
@ -208,7 +208,7 @@ sub enclast()
&and ($tmp,0x0000ff00); &and ($tmp,0x0000ff00);
&xor ($out,$tmp); &xor ($out,$tmp);
if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }##%ebx if ($i==3) { $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
else { mov ($tmp,$s[2]); else { mov ($tmp,$s[2]);
&shr ($tmp,16); } &shr ($tmp,16); }
if ($i==2) { &and ($s[1],0xFF); }#%edx[2] if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
@ -217,14 +217,14 @@ sub enclast()
&and ($tmp,0x00ff0000); &and ($tmp,0x00ff0000);
&xor ($out,$tmp); &xor ($out,$tmp);
if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }##%ecx if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
else { &mov ($tmp,$s[3]); else { &mov ($tmp,$s[3]);
&shr ($tmp,24); } &shr ($tmp,24); }
&mov ($tmp,&DWP(2,$te,$tmp,8)); &mov ($tmp,&DWP(2,$te,$tmp,8));
&and ($tmp,0xff000000); &and ($tmp,0xff000000);
&xor ($out,$tmp); &xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); } if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
if ($i==3) { &mov ($s[3],$acc); } if ($i==3) { &mov ($s[3],$acc); }
} }
@ -238,13 +238,8 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
&mov ($s2="esi",$acc="ecx"); &mov ($s2="esi",$acc="ecx");
} }
# allocate aligned stack frame # note that caller is expected to allocate stack frame for me!
&mov ($acc,"esp");
&sub ("esp",20);
&and ("esp",-16);
&mov (&DWP(12,"esp"),$key); # save key &mov (&DWP(12,"esp"),$key); # save key
&mov (&DWP(16,"esp"),$acc); # save %esp
&xor ($s0,&DWP(0,$key)); # xor with key &xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key)); &xor ($s1,&DWP(4,$key));
@ -256,7 +251,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
if ($small_footprint) { if ($small_footprint) {
&lea ($acc,&DWP(-2,$acc,$acc)); &lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8)); &lea ($acc,&DWP(0,$key,$acc,8));
&mov (&DWP(8,"esp"),$acc); # end of key schedule &mov (&DWP(16,"esp"),$acc); # end of key schedule
&align (4); &align (4);
&set_label("loop"); &set_label("loop");
if ($vertical_spin) { if ($vertical_spin) {
@ -272,7 +267,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
&xor ($s1,&DWP(4,$key)); &xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key)); &xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key)); &xor ($s3,&DWP(12,$key));
&cmp ($key,&DWP(8,"esp")); &cmp ($key,&DWP(16,"esp"));
&mov (&DWP(12,"esp"),$key); &mov (&DWP(12,"esp"),$key);
&jb (&label("loop")); &jb (&label("loop"));
} }
@ -343,7 +338,6 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
&enclast(2,"ebp",$s2,$s3,$s0,$s1); &enclast(2,"ebp",$s2,$s3,$s0,$s1);
&enclast(3,"ebp",$s3,$s0,$s1,$s2); &enclast(3,"ebp",$s3,$s0,$s1,$s2);
&mov ("esp",&DWP(16,"esp")); # restore %esp
&add ($key,$small_footprint?16:160); &add ($key,$small_footprint?16:160);
&xor ($s0,&DWP(0,$key)); &xor ($s0,&DWP(0,$key));
&xor ($s1,&DWP(4,$key)); &xor ($s1,&DWP(4,$key));
@ -429,6 +423,12 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
&mov ($acc,&wparam(0)); # load inp &mov ($acc,&wparam(0)); # load inp
&mov ($key,&wparam(2)); # load key &mov ($key,&wparam(2)); # load key
&mov ($s0,"esp");
&sub ("esp",24);
&and ("esp",-64);
&add ("esp",4);
&mov (&DWP(16,"esp"),$s0);
&call (&label("pic_point")); # make it PIC! &call (&label("pic_point")); # make it PIC!
&set_label("pic_point"); &set_label("pic_point");
&blindpop("ebp"); &blindpop("ebp");
@ -441,6 +441,8 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
&call ("_x86_AES_encrypt"); &call ("_x86_AES_encrypt");
&mov ("esp",&DWP(16,"esp"));
&mov ($acc,&wparam(1)); # load out &mov ($acc,&wparam(1)); # load out
&mov (&DWP(0,$acc),$s0); # write output data &mov (&DWP(0,$acc),$s0); # write output data
&mov (&DWP(4,$acc),$s1); &mov (&DWP(4,$acc),$s1);
@ -474,12 +476,12 @@ sub decstep()
&and ($tmp,0xFF); &and ($tmp,0xFF);
&xor ($out,&DWP(2,$td,$tmp,8)); &xor ($out,&DWP(2,$td,$tmp,8));
if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); } if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
else { &mov ($tmp,$s[3]); } else { &mov ($tmp,$s[3]); }
&shr ($tmp,24); &shr ($tmp,24);
&xor ($out,&DWP(1,$td,$tmp,8)); &xor ($out,&DWP(1,$td,$tmp,8));
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); } if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
if ($i==3) { &mov ($s[3],&DWP(0,"esp")); } if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
&comment(); &comment();
} }
@ -508,25 +510,20 @@ sub declast()
&and ($tmp,0x00ff0000); &and ($tmp,0x00ff0000);
&xor ($out,$tmp); &xor ($out,$tmp);
if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); } if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
else { &mov ($tmp,$s[3]); } else { &mov ($tmp,$s[3]); }
&shr ($tmp,24); &shr ($tmp,24);
&mov ($tmp,&DWP(2048,$td,$tmp,4)); &mov ($tmp,&DWP(2048,$td,$tmp,4));
&and ($tmp,0xff000000); &and ($tmp,0xff000000);
&xor ($out,$tmp); &xor ($out,$tmp);
if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); } if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
if ($i==3) { &mov ($s[3],&DWP(0,"esp")); } if ($i==3) { &mov ($s[3],&DWP(4,"esp")); }
} }
&public_label("AES_Td"); &public_label("AES_Td");
&function_begin_B("_x86_AES_decrypt"); &function_begin_B("_x86_AES_decrypt");
# allocate aligned stack frame # note that caller is expected to allocate stack frame for me!
&mov ($acc,"esp");
&sub ("esp",20);
&and ("esp",-16);
&mov (&DWP(12,"esp"),$key); # save key &mov (&DWP(12,"esp"),$key); # save key
&mov (&DWP(16,"esp"),$acc); # save %esp
&xor ($s0,&DWP(0,$key)); # xor with key &xor ($s0,&DWP(0,$key)); # xor with key
&xor ($s1,&DWP(4,$key)); &xor ($s1,&DWP(4,$key));
@ -538,7 +535,7 @@ sub declast()
if ($small_footprint) { if ($small_footprint) {
&lea ($acc,&DWP(-2,$acc,$acc)); &lea ($acc,&DWP(-2,$acc,$acc));
&lea ($acc,&DWP(0,$key,$acc,8)); &lea ($acc,&DWP(0,$key,$acc,8));
&mov (&DWP(8,"esp"),$acc); # end of key schedule &mov (&DWP(16,"esp"),$acc); # end of key schedule
&align (4); &align (4);
&set_label("loop"); &set_label("loop");
&decstep(0,"ebp",$s0,$s3,$s2,$s1); &decstep(0,"ebp",$s0,$s3,$s2,$s1);
@ -550,7 +547,7 @@ sub declast()
&xor ($s1,&DWP(4,$key)); &xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key)); &xor ($s2,&DWP(8,$key));
&xor ($s3,&DWP(12,$key)); &xor ($s3,&DWP(12,$key));
&cmp ($key,&DWP(8,"esp")); &cmp ($key,&DWP(16,"esp"));
&mov (&DWP(12,"esp"),$key); &mov (&DWP(12,"esp"),$key);
&jb (&label("loop")); &jb (&label("loop"));
} }
@ -604,7 +601,6 @@ sub declast()
&declast(2,"ebp",$s2,$s1,$s0,$s3); &declast(2,"ebp",$s2,$s1,$s0,$s3);
&declast(3,"ebp",$s3,$s2,$s1,$s0); &declast(3,"ebp",$s3,$s2,$s1,$s0);
&mov ("esp",&DWP(16,"esp")); # restore %esp
&add ($key,$small_footprint?16:160); &add ($key,$small_footprint?16:160);
&xor ($s0,&DWP(0,$key)); &xor ($s0,&DWP(0,$key));
&xor ($s1,&DWP(4,$key)); &xor ($s1,&DWP(4,$key));
@ -751,6 +747,12 @@ sub declast()
&mov ($acc,&wparam(0)); # load inp &mov ($acc,&wparam(0)); # load inp
&mov ($key,&wparam(2)); # load key &mov ($key,&wparam(2)); # load key
&mov ($s0,"esp");
&sub ("esp",24);
&and ("esp",-64);
&add ("esp",4);
&mov (&DWP(16,"esp"),$s0);
&call (&label("pic_point")); # make it PIC! &call (&label("pic_point")); # make it PIC!
&set_label("pic_point"); &set_label("pic_point");
&blindpop("ebp"); &blindpop("ebp");
@ -763,6 +765,8 @@ sub declast()
&call ("_x86_AES_decrypt"); &call ("_x86_AES_decrypt");
&mov ("esp",&DWP(16,"esp"));
&mov ($acc,&wparam(1)); # load out &mov ($acc,&wparam(1)); # load out
&mov (&DWP(0,$acc),$s0); # write output data &mov (&DWP(0,$acc),$s0); # write output data
&mov (&DWP(4,$acc),$s1); &mov (&DWP(4,$acc),$s1);
@ -773,6 +777,22 @@ sub declast()
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
# size_t length, const AES_KEY *key, # size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc); # unsigned char *ivp,const int enc);
{
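# stack frame layout
# (left column: offset as seen by AES_cbc_encrypt itself; right column:
# the same slot as seen inside _x86_AES_[en|de]crypt, i.e. after the
# call instruction has pushed the 4-byte return address)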
# -4(%esp) 0(%esp) return address
# 0(%esp) 4(%esp) tmp1
# 4(%esp) 8(%esp) tmp2
# 8(%esp) 12(%esp) key
# 12(%esp) 16(%esp) end of key schedule
my $_esp=&DWP(16,"esp"); #saved %esp
my $_inp=&DWP(20,"esp"); #copy of wparam(0)
my $_out=&DWP(24,"esp"); #copy of wparam(1)
my $_len=&DWP(28,"esp"); #copy of wparam(2)
my $_key=&DWP(32,"esp"); #copy of wparam(3)
my $_ivp=&DWP(36,"esp"); #copy of wparam(4)
my $_tmp=&DWP(40,"esp"); #volatile variable
my $ivec=&DWP(44,"esp"); #ivec[16]
&public_label("AES_Te"); &public_label("AES_Te");
&public_label("AES_Td"); &public_label("AES_Td");
&function_begin("AES_cbc_encrypt"); &function_begin("AES_cbc_encrypt");
@ -789,20 +809,58 @@ sub declast()
&lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));
&mov ($acc,&wparam(0)); # load inp # allocate aligned stack frame...
&mov ($key,&wparam(4)); # load ivp &lea ($key,&DWP(-44,"esp"));
&and ($key,-64);
# ... and make sure it doesn't alias with AES_Te modulo 4096
&mov ($s1,"ebp");
&mov ($s3,$key);
&and ($s1,0xfff); # t = %ebp&0xfff
&and ($s3,0xfff); # p = %esp&0xfff
&cmp ($s3,$s1); # if (p<t) goto ok
&jb (&label("te_ok"));
&lea ($acc,&DWP(2048,$s1));
&cmp ($s3,$acc); # if (p>=(t+2048)) goto ok
&jae (&label("te_ok"));
&sub ($s1,$s3); # t -= p
&lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64
&set_label("te_ok");
&mov ($s0,&wparam(0)); # load inp
&mov ($s1,&wparam(1)); # load out
&mov ($s3,&wparam(3)); # load key
&mov ($acc,&wparam(4)); # load ivp
&exch ("esp",$key);
&add ("esp",4); # reserve for return address!
&mov ($_esp,$key); # save %esp
&mov ($_inp,$s0); # save copy of inp
&mov ($_out,$s1); # save copy of out
&mov ($_len,$s2); # save copy of len
&mov ($_key,$s3); # save copy of key
&mov ($_ivp,$acc); # save copy of ivp
&mov ($acc,$s0);
&mov ($key,16);
&align (4);
&set_label("prefetch_te");
&mov ($s0,&DWP(0,"ebp"));
&mov ($s1,&DWP(32,"ebp"));
&mov ($s2,&DWP(64,"ebp"));
&mov ($s3,&DWP(96,"ebp"));
&lea ("ebp",&DWP(128,"ebp"));
&dec ($key);
&jnz (&label("prefetch_te"));
&sub ("ebp",2048);
&mov ($s2,$_len);
&mov ($key,$_ivp);
&test ($s2,0xFFFFFFF0); &test ($s2,0xFFFFFFF0);
&jz (&label("enc_tail")); # short input... &jz (&label("enc_tail")); # short input...
# prefetch AES_Te
for ($i=0;$i<2048;$i+=128)
{ &mov ($s0,&DWP($i+0,"ebp"));
&mov ($s1,&DWP($i+32,"ebp"));
&mov ($s2,&DWP($i+64,"ebp"));
&mov ($s3,&DWP($i+96,"ebp"));
}
&mov ($s0,&DWP(0,$key)); # load iv &mov ($s0,&DWP(0,$key)); # load iv
&mov ($s1,&DWP(4,$key)); &mov ($s1,&DWP(4,$key));
@ -816,38 +874,39 @@ sub declast()
&xor ($s2,&DWP(8,$acc)); &xor ($s2,&DWP(8,$acc));
&xor ($s3,&DWP(12,$acc)); &xor ($s3,&DWP(12,$acc));
&mov ($key,&wparam(3)); # load key &mov ($key,$_key); # load key
&call ("_x86_AES_encrypt"); &call ("_x86_AES_encrypt");
&mov ($acc,&wparam(0)); # load inp &mov ($acc,$_inp); # load inp
&mov ($key,&wparam(1)); # load out &mov ($key,$_out); # load out
&mov (&DWP(0,$key),$s0); # save output data &mov (&DWP(0,$key),$s0); # save output data
&mov (&DWP(4,$key),$s1); &mov (&DWP(4,$key),$s1);
&mov (&DWP(8,$key),$s2); &mov (&DWP(8,$key),$s2);
&mov (&DWP(12,$key),$s3); &mov (&DWP(12,$key),$s3);
&mov ($s2,&wparam(2)); # load len &mov ($s2,$_len); # load len
&lea ($acc,&DWP(16,$acc)); &lea ($acc,&DWP(16,$acc));
&mov (&wparam(0),$acc); # save inp &mov ($_inp,$acc); # save inp
&lea ($s3,&DWP(16,$key)); &lea ($s3,&DWP(16,$key));
&mov (&wparam(1),$s3); # save out &mov ($_out,$s3); # save out
&sub ($s2,16); &sub ($s2,16);
&test ($s2,0xFFFFFFF0); &test ($s2,0xFFFFFFF0);
&mov (&wparam(2),$s2); # save len &mov ($_len,$s2); # save len
&jnz (&label("enc_loop")); &jnz (&label("enc_loop"));
&test ($s2,15); &test ($s2,15);
&jnz (&label("enc_tail")); &jnz (&label("enc_tail"));
&mov ($acc,&wparam(4)); # load ivp &mov ($acc,$_ivp); # load ivp
&mov ($s2,&DWP(8,$key)); # restore last dwords &mov ($s2,&DWP(8,$key)); # restore last dwords
&mov ($s3,&DWP(12,$key)); &mov ($s3,&DWP(12,$key));
&mov (&DWP(0,$acc),$s0); # save iv &mov (&DWP(0,$acc),$s0); # save ivec
&mov (&DWP(4,$acc),$s1); &mov (&DWP(4,$acc),$s1);
&mov (&DWP(8,$acc),$s2); &mov (&DWP(8,$acc),$s2);
&mov (&DWP(12,$acc),$s3); &mov (&DWP(12,$acc),$s3);
&mov ("esp",$_esp);
&set_label("enc_out"); &set_label("enc_out");
&function_end_A(); &function_end_A();
@ -855,7 +914,7 @@ sub declast()
&set_label("enc_tail"); &set_label("enc_tail");
&push ($key eq "edi" ? $key : ""); # push ivp &push ($key eq "edi" ? $key : ""); # push ivp
&pushf (); &pushf ();
&mov ($key,&wparam(1)); # load out &mov ($key,$_out); # load out
&mov ($s1,16); &mov ($s1,16);
&sub ($s1,$s2); &sub ($s1,$s2);
&cmp ($key,$acc); # compare with inp &cmp ($key,$acc); # compare with inp
@ -871,41 +930,69 @@ sub declast()
&popf (); &popf ();
&pop ($key); # pop ivp &pop ($key); # pop ivp
# prefetch AES_Te &mov ($acc,$_out); # output as input
for ($i=0;$i<2048;$i+=128)
{ &mov ($s0,&DWP($i+0,"ebp"));
&mov ($s1,&DWP($i+32,"ebp"));
&mov ($s2,&DWP($i+64,"ebp"));
&mov ($s3,&DWP($i+96,"ebp"));
}
&mov ($acc,&wparam(1)); # output as input
&mov ($s0,&DWP(0,$key)); &mov ($s0,&DWP(0,$key));
&mov ($s1,&DWP(4,$key)); &mov ($s1,&DWP(4,$key));
&mov (&wparam(2),16); # len=16 &mov ($_len,16); # len=16
&jmp (&label("enc_loop")); # one more spin... &jmp (&label("enc_loop")); # one more spin...
#----------------------------- DECRYPT -----------------------------# #----------------------------- DECRYPT -----------------------------#
&align (4); &align (4);
&set_label("DECRYPT"); &set_label("DECRYPT");
&stack_push(5); # allocate temp + ivp
&lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
# prefetch AES_Td # allocate aligned stack frame...
for ($i=0;$i<3072;$i+=128) &lea ($key,&DWP(-64,"esp"));
{ &mov ($s0,&DWP($i+0,"ebp")); &and ($key,-64);
&mov ($s1,&DWP($i+32,"ebp"));
&mov ($s2,&DWP($i+64,"ebp"));
&mov ($s3,&DWP($i+96,"ebp"));
}
&mov ($acc,&wparam(0)); # load inp # ... and make sure it doesn't alias with AES_Td modulo 4096
&cmp ($acc,&wparam(1)); &mov ($s1,"ebp");
&mov ($s3,$key);
&and ($s1,0xfff); # t = %ebp&0xfff
&and ($s3,0xfff); # p = %esp&0xfff
&cmp ($s3,$s1); # if (p<t) goto ok
&jb (&label("td_ok"));
&lea ($acc,&DWP(3072,$s1));
&cmp ($s3,$acc); # if (p>=(t+3072)) goto ok
&jae (&label("td_ok"));
&sub ($s1,$s3); # t -= p
&lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64
&set_label("td_ok");
&mov ($s0,&wparam(0)); # load inp
&mov ($s1,&wparam(1)); # load out
&mov ($s3,&wparam(3)); # load key
&mov ($acc,&wparam(4)); # load ivp
&exch ("esp",$key);
&add ("esp",4); # reserve for return address!
&mov ($_esp,$key); # save %esp
&mov ($_inp,$s0); # save copy of inp
&mov ($_out,$s1); # save copy of out
&mov ($_len,$s2); # save copy of len
&mov ($_key,$s3); # save copy of key
&mov ($_ivp,$acc); # save copy of ivp
&mov ($acc,$s0);
&mov ($key,24);
&align (4);
&set_label("prefetch_td");
&mov ($s0,&DWP(0,"ebp"));
&mov ($s1,&DWP(32,"ebp"));
&mov ($s2,&DWP(64,"ebp"));
&mov ($s3,&DWP(96,"ebp"));
&lea ("ebp",&DWP(128,"ebp"));
&dec ($key);
&jnz (&label("prefetch_td"));
&sub ("ebp",3072);
&cmp ($acc,$_out);
&je (&label("dec_in_place")); # in-place processing... &je (&label("dec_in_place")); # in-place processing...
&mov ($key,&wparam(4)); # load ivp &mov ($key,$_ivp); # load ivp
&mov (&swtmp(4),$key); &mov ($_tmp,$key);
&align (4); &align (4);
&set_label("dec_loop"); &set_label("dec_loop");
@ -914,11 +1001,11 @@ sub declast()
&mov ($s2,&DWP(8,$acc)); &mov ($s2,&DWP(8,$acc));
&mov ($s3,&DWP(12,$acc)); &mov ($s3,&DWP(12,$acc));
&mov ($key,&wparam(3)); # load key &mov ($key,$_key); # load key
&call ("_x86_AES_decrypt"); &call ("_x86_AES_decrypt");
&mov ($key,&swtmp(4)); # load ivp &mov ($key,$_tmp); # load ivp
&mov ($acc,&wparam(2)); # load len &mov ($acc,$_len); # load len
&xor ($s0,&DWP(0,$key)); # xor iv &xor ($s0,&DWP(0,$key)); # xor iv
&xor ($s1,&DWP(4,$key)); &xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key)); &xor ($s2,&DWP(8,$key));
@ -926,26 +1013,26 @@ sub declast()
&sub ($acc,16); &sub ($acc,16);
&jc (&label("dec_partial")); &jc (&label("dec_partial"));
&mov (&wparam(2),$acc); # save len &mov ($_len,$acc); # save len
&mov ($acc,&wparam(0)); # load inp &mov ($acc,$_inp); # load inp
&mov ($key,&wparam(1)); # load out &mov ($key,$_out); # load out
&mov (&DWP(0,$key),$s0); # write output &mov (&DWP(0,$key),$s0); # write output
&mov (&DWP(4,$key),$s1); &mov (&DWP(4,$key),$s1);
&mov (&DWP(8,$key),$s2); &mov (&DWP(8,$key),$s2);
&mov (&DWP(12,$key),$s3); &mov (&DWP(12,$key),$s3);
&mov (&swtmp(4),$acc); # save ivp &mov ($_tmp,$acc); # save ivp
&lea ($acc,&DWP(16,$acc)); &lea ($acc,&DWP(16,$acc));
&mov (&wparam(0),$acc); # save inp &mov ($_inp,$acc); # save inp
&lea ($key,&DWP(16,$key)); &lea ($key,&DWP(16,$key));
&mov (&wparam(1),$key); # save out &mov ($_out,$key); # save out
&jnz (&label("dec_loop")); &jnz (&label("dec_loop"));
&mov ($key,&swtmp(4)); # load temp ivp &mov ($key,$_tmp); # load temp ivp
&set_label("dec_end"); &set_label("dec_end");
&mov ($acc,&wparam(4)); # load user ivp &mov ($acc,$_ivp); # load user ivp
&mov ($s0,&DWP(0,$key)); # load iv &mov ($s0,&DWP(0,$key)); # load iv
&mov ($s1,&DWP(4,$key)); &mov ($s1,&DWP(4,$key));
&mov ($s2,&DWP(8,$key)); &mov ($s2,&DWP(8,$key));
@ -958,24 +1045,24 @@ sub declast()
&align (4); &align (4);
&set_label("dec_partial"); &set_label("dec_partial");
&lea ($key,&swtmp(0)); &lea ($key,$ivec);
&mov (&DWP(0,$key),$s0); # dump output to stack &mov (&DWP(0,$key),$s0); # dump output to stack
&mov (&DWP(4,$key),$s1); &mov (&DWP(4,$key),$s1);
&mov (&DWP(8,$key),$s2); &mov (&DWP(8,$key),$s2);
&mov (&DWP(12,$key),$s3); &mov (&DWP(12,$key),$s3);
&lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
&mov ($acc eq "esi" ? $acc : "",$key); &mov ($acc eq "esi" ? $acc : "",$key);
&mov ($key eq "edi" ? $key : "",&wparam(1)); &mov ($key eq "edi" ? $key : "",$_out); # load out
&pushf (); &pushf ();
&data_word(0x90A4F3FC); # cld; rep movsb; nop # copy output &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy output
&popf (); &popf ();
&mov ($key,&wparam(0)); # load temp ivp &mov ($key,$_inp); # use inp as temp ivp
&jmp (&label("dec_end")); &jmp (&label("dec_end"));
&align (4); &align (4);
&set_label("dec_in_place"); &set_label("dec_in_place");
&set_label("dec_in_place_loop"); &set_label("dec_in_place_loop");
&lea ($key,&swtmp(0)); &lea ($key,$ivec);
&mov ($s0,&DWP(0,$acc)); # read input &mov ($s0,&DWP(0,$acc)); # read input
&mov ($s1,&DWP(4,$acc)); &mov ($s1,&DWP(4,$acc));
&mov ($s2,&DWP(8,$acc)); &mov ($s2,&DWP(8,$acc));
@ -986,11 +1073,11 @@ sub declast()
&mov (&DWP(8,$key),$s2); &mov (&DWP(8,$key),$s2);
&mov (&DWP(12,$key),$s3); &mov (&DWP(12,$key),$s3);
&mov ($key,&wparam(3)); # load key &mov ($key,$_key); # load key
&call ("_x86_AES_decrypt"); &call ("_x86_AES_decrypt");
&mov ($key,&wparam(4)); # load ivp &mov ($key,$_ivp); # load ivp
&mov ($acc,&wparam(1)); # load out &mov ($acc,$_out); # load out
&xor ($s0,&DWP(0,$key)); # xor iv &xor ($s0,&DWP(0,$key)); # xor iv
&xor ($s1,&DWP(4,$key)); &xor ($s1,&DWP(4,$key));
&xor ($s2,&DWP(8,$key)); &xor ($s2,&DWP(8,$key));
@ -1002,9 +1089,9 @@ sub declast()
&mov (&DWP(12,$acc),$s3); &mov (&DWP(12,$acc),$s3);
&lea ($acc,&DWP(16,$acc)); &lea ($acc,&DWP(16,$acc));
&mov (&wparam(1),$acc); # save out &mov ($_out,$acc); # save out
&lea ($acc,&swtmp(0)); &lea ($acc,$ivec);
&mov ($s0,&DWP(0,$acc)); # read temp &mov ($s0,&DWP(0,$acc)); # read temp
&mov ($s1,&DWP(4,$acc)); &mov ($s1,&DWP(4,$acc));
&mov ($s2,&DWP(8,$acc)); &mov ($s2,&DWP(8,$acc));
@ -1015,23 +1102,23 @@ sub declast()
&mov (&DWP(8,$key),$s2); &mov (&DWP(8,$key),$s2);
&mov (&DWP(12,$key),$s3); &mov (&DWP(12,$key),$s3);
&mov ($acc,&wparam(0)); # load inp &mov ($acc,$_inp); # load inp
&lea ($acc,&DWP(16,$acc)); &lea ($acc,&DWP(16,$acc));
&mov (&wparam(0),$acc); # save inp &mov ($_inp,$acc); # save inp
&mov ($s2,&wparam(2)); # load len &mov ($s2,$_len); # load len
&sub ($s2,16); &sub ($s2,16);
&jc (&label("dec_in_place_partial")); &jc (&label("dec_in_place_partial"));
&mov (&wparam(2),$s2); # save len &mov ($_len,$s2); # save len
&jnz (&label("dec_in_place_loop")); &jnz (&label("dec_in_place_loop"));
&jmp (&label("dec_out")); &jmp (&label("dec_out"));
&align (4); &align (4);
&set_label("dec_in_place_partial"); &set_label("dec_in_place_partial");
# one can argue whether this is actually required... # one can argue whether this is actually required...
&mov ($key eq "edi" ? $key : "",&wparam(1)); &mov ($key eq "edi" ? $key : "",$_out);
&lea ($acc eq "esi" ? $acc : "",&swtmp(0)); &lea ($acc eq "esi" ? $acc : "",$ivec);
&lea ($key,&DWP(0,$key,$s2)); &lea ($key,&DWP(0,$key,$s2));
&lea ($acc,&DWP(16,$acc,$s2)); &lea ($acc,&DWP(16,$acc,$s2));
&neg ($s2 eq "ecx" ? $s2 : ""); &neg ($s2 eq "ecx" ? $s2 : "");
@ -1041,8 +1128,9 @@ sub declast()
&align (4); &align (4);
&set_label("dec_out"); &set_label("dec_out");
&stack_pop(5); &mov ("esp",$_esp);
&function_end("AES_cbc_encrypt"); &function_end("AES_cbc_encrypt");
}
#------------------------------------------------------------------# #------------------------------------------------------------------#