Switch to compact S-box when generating AES key schedule.
This commit is contained in:
parent
8cebec9802
commit
53154d71c3
@ -6,7 +6,7 @@
|
||||
# forms are granted according to the OpenSSL license.
|
||||
# ====================================================================
|
||||
#
|
||||
# Version 4.0.
|
||||
# Version 4.1.
|
||||
#
|
||||
# You might fail to appreciate this module's performance from the first
|
||||
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
|
||||
@ -102,10 +102,12 @@
|
||||
# byte for 128-bit key.
|
||||
#
|
||||
# ECB encrypt ECB decrypt CBC large chunk
|
||||
# P4 57[60] 84[100] 23
|
||||
# P4 56[60] 84[100] 23
|
||||
# AMD K8 48[44] 70[79] 18
|
||||
# PIII 41[50] 61[91] 24
|
||||
# Pentium 120 160 77
|
||||
#
|
||||
# Version 4.1 switches to compact S-box even in key schedule setup.
|
||||
|
||||
push(@INC,"perlasm","../../perlasm");
|
||||
require "x86asm.pl";
|
||||
@ -263,56 +265,56 @@ sub enchoriz()
|
||||
# *all* references to stack, it's not faster...
|
||||
sub mmx_encbody()
|
||||
{
|
||||
&movz ("esi",&LB("eax")); # 0
|
||||
&mov ("ecx",&DWP(0,$tbl,"esi",8)); # 0
|
||||
&movz ($acc,&LB("eax")); # 0
|
||||
&mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
|
||||
&pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
|
||||
&movz ("edx",&HB("eax")); # 1
|
||||
&mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
|
||||
&shr ("eax",16); # 5, 4
|
||||
|
||||
&movz ("esi",&LB("ebx")); # 10
|
||||
&xor ("ecx",&DWP(2,$tbl,"esi",8)); # 10
|
||||
&movz ($acc,&LB("ebx")); # 10
|
||||
&xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
|
||||
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
|
||||
&movz ("esi",&HB("ebx")); # 11
|
||||
&xor ("edx",&DWP(1,$tbl,"esi",8)); # 11
|
||||
&movz ($acc,&HB("ebx")); # 11
|
||||
&xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
|
||||
&shr ("ebx",16); # 15,14
|
||||
|
||||
&movz ("esi",&HB("eax")); # 5
|
||||
&xor ("ecx",&DWP(3,$tbl,"esi",8)); # 5
|
||||
&movz ($acc,&HB("eax")); # 5
|
||||
&xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
|
||||
&movq ("mm3",QWP(16,$key));
|
||||
&movz ("esi",&HB("ebx")); # 15
|
||||
&xor ("ecx",&DWP(1,$tbl,"esi",8)); # 15
|
||||
&movz ($acc,&HB("ebx")); # 15
|
||||
&xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
|
||||
&movd ("mm0","ecx"); # t[0] collected
|
||||
|
||||
&movz ("esi",&LB("eax")); # 4
|
||||
&mov ("ecx",&DWP(0,$tbl,"esi",8)); # 4
|
||||
&movz ($acc,&LB("eax")); # 4
|
||||
&mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
|
||||
&movd ("eax","mm2"); # 7, 6, 3, 2
|
||||
&movz ("esi",&LB("ebx")); # 14
|
||||
&xor ("ecx",&DWP(2,$tbl,"esi",8)); # 14
|
||||
&movz ($acc,&LB("ebx")); # 14
|
||||
&xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
|
||||
&movd ("ebx","mm6"); # 13,12, 9, 8
|
||||
|
||||
&movz ("esi",&HB("eax")); # 3
|
||||
&xor ("ecx",&DWP(1,$tbl,"esi",8)); # 3
|
||||
&movz ("esi",&HB("ebx")); # 9
|
||||
&xor ("ecx",&DWP(3,$tbl,"esi",8)); # 9
|
||||
&movz ($acc,&HB("eax")); # 3
|
||||
&xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
|
||||
&movz ($acc,&HB("ebx")); # 9
|
||||
&xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
|
||||
&movd ("mm1","ecx"); # t[1] collected
|
||||
|
||||
&movz ("esi",&LB("eax")); # 2
|
||||
&mov ("ecx",&DWP(2,$tbl,"esi",8)); # 2
|
||||
&movz ($acc,&LB("eax")); # 2
|
||||
&mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
|
||||
&shr ("eax",16); # 7, 6
|
||||
&punpckldq ("mm0","mm1"); # t[0,1] collected
|
||||
&movz ("esi",&LB("ebx")); # 8
|
||||
&xor ("ecx",&DWP(0,$tbl,"esi",8)); # 8
|
||||
&movz ($acc,&LB("ebx")); # 8
|
||||
&xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
|
||||
&shr ("ebx",16); # 13,12
|
||||
|
||||
&movz ("esi",&HB("eax")); # 7
|
||||
&xor ("ecx",&DWP(1,$tbl,"esi",8)); # 7
|
||||
&movz ($acc,&HB("eax")); # 7
|
||||
&xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
|
||||
&pxor ("mm0","mm3");
|
||||
&movz ("eax",&LB("eax")); # 6
|
||||
&xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
|
||||
&pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
|
||||
&movz ("esi",&HB("ebx")); # 13
|
||||
&xor ("ecx",&DWP(3,$tbl,"esi",8)); # 13
|
||||
&movz ($acc,&HB("ebx")); # 13
|
||||
&xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
|
||||
&xor ("ecx",&DWP(24,$key)); # t[2]
|
||||
&movd ("mm4","ecx"); # t[2] collected
|
||||
&movz ("ebx",&LB("ebx")); # 12
|
||||
@ -347,11 +349,11 @@ sub enccompact()
|
||||
&and ($out,0xFF);
|
||||
if ($i==1) { &shr ($s[0],16); }#%ebx[1]
|
||||
if ($i==2) { &shr ($s[0],24); }#%ecx[2]
|
||||
&movz ($out,&DWP(-128,$te,$out,1));
|
||||
&movz ($out,&BP(-128,$te,$out,1));
|
||||
|
||||
if ($i==3) { $tmp=$s[1]; }##%eax
|
||||
&movz ($tmp,&HB($s[1]));
|
||||
&movz ($tmp,&DWP(-128,$te,$tmp,1));
|
||||
&movz ($tmp,&BP(-128,$te,$tmp,1));
|
||||
&shl ($tmp,8);
|
||||
&xor ($out,$tmp);
|
||||
|
||||
@ -360,7 +362,7 @@ sub enccompact()
|
||||
&shr ($tmp,16); }
|
||||
if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
|
||||
&and ($tmp,0xFF);
|
||||
&movz ($tmp,&DWP(-128,$te,$tmp,1));
|
||||
&movz ($tmp,&BP(-128,$te,$tmp,1));
|
||||
&shl ($tmp,16);
|
||||
&xor ($out,$tmp);
|
||||
|
||||
@ -368,7 +370,7 @@ sub enccompact()
|
||||
elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
|
||||
else { &mov ($tmp,$s[3]);
|
||||
&shr ($tmp,24); }
|
||||
&movz ($tmp,&DWP(-128,$te,$tmp,1));
|
||||
&movz ($tmp,&BP(-128,$te,$tmp,1));
|
||||
&shl ($tmp,24);
|
||||
&xor ($out,$tmp);
|
||||
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
|
||||
@ -469,9 +471,9 @@ sub enctransform()
|
||||
#
|
||||
# Performance is not actually extraordinary in comparison to pure
|
||||
# x86 code. In particular encrypt performance is virtually the same.
|
||||
# same. Decrypt performance on the other hand is 15-20% better on
|
||||
# newer µ-archs [but we're thankful for *any* improvement here], and
|
||||
# ~50% better on PIII:-) And additionally on the pros side this code
|
||||
# Decrypt performance on the other hand is 15-20% better on newer
|
||||
# µ-archs [but we're thankful for *any* improvement here], and ~50%
|
||||
# better on PIII:-) And additionally on the pros side this code
|
||||
# eliminates redundant references to stack and thus relieves/
|
||||
# minimizes the pressure on the memory bus.
|
||||
#
|
||||
@ -516,80 +518,80 @@ sub mmx_enccompact()
|
||||
&movd ("eax","mm1"); # 5, 4, 1, 0
|
||||
&movd ("ebx","mm5"); # 15,14,11,10
|
||||
|
||||
&movz ("esi",&LB("eax")); # 0
|
||||
&movz ("ecx",&DWP(-128,$tbl,"esi",1));# 0
|
||||
&movz ($acc,&LB("eax")); # 0
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
|
||||
&pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
|
||||
&movz ("edx",&HB("eax")); # 1
|
||||
&movz ("edx",&DWP(-128,$tbl,"edx",1));# 1
|
||||
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
|
||||
&shl ("edx",8); # 1
|
||||
&shr ("eax",16); # 5, 4
|
||||
|
||||
&movz ("esi",&LB("ebx")); # 10
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 10
|
||||
&shl ("esi",16); # 10
|
||||
&or ("ecx","esi"); # 10
|
||||
&movz ($acc,&LB("ebx")); # 10
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
|
||||
&shl ($acc,16); # 10
|
||||
&or ("ecx",$acc); # 10
|
||||
&pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
|
||||
&movz ("esi",&HB("ebx")); # 11
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 11
|
||||
&shl ("esi",24); # 11
|
||||
&or ("edx","esi"); # 11
|
||||
&movz ($acc,&HB("ebx")); # 11
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
|
||||
&shl ($acc,24); # 11
|
||||
&or ("edx",$acc); # 11
|
||||
&shr ("ebx",16); # 15,14
|
||||
|
||||
&movz ("esi",&HB("eax")); # 5
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 5
|
||||
&shl ("esi",8); # 5
|
||||
&or ("ecx","esi"); # 5
|
||||
&movz ("esi",&HB("ebx")); # 15
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 15
|
||||
&shl ("esi",24); # 15
|
||||
&or ("ecx","esi"); # 15
|
||||
&movz ($acc,&HB("eax")); # 5
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 5
|
||||
&shl ($acc,8); # 5
|
||||
&or ("ecx",$acc); # 5
|
||||
&movz ($acc,&HB("ebx")); # 15
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
|
||||
&shl ($acc,24); # 15
|
||||
&or ("ecx",$acc); # 15
|
||||
&movd ("mm0","ecx"); # t[0] collected
|
||||
|
||||
&movz ("esi",&LB("eax")); # 4
|
||||
&movz ("ecx",&DWP(-128,$tbl,"esi",1));# 4
|
||||
&movz ($acc,&LB("eax")); # 4
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 4
|
||||
&movd ("eax","mm2"); # 7, 6, 3, 2
|
||||
&movz ("esi",&LB("ebx")); # 14
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 14
|
||||
&shl ("esi",16); # 14
|
||||
&or ("ecx","esi"); # 14
|
||||
&movz ($acc,&LB("ebx")); # 14
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
|
||||
&shl ($acc,16); # 14
|
||||
&or ("ecx",$acc); # 14
|
||||
|
||||
&movd ("ebx","mm6"); # 13,12, 9, 8
|
||||
&movz ("esi",&HB("eax")); # 3
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 3
|
||||
&shl ("esi",24); # 3
|
||||
&or ("ecx","esi"); # 3
|
||||
&movz ("esi",&HB("ebx")); # 9
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 9
|
||||
&shl ("esi",8); # 9
|
||||
&or ("ecx","esi"); # 9
|
||||
&movz ($acc,&HB("eax")); # 3
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 3
|
||||
&shl ($acc,24); # 3
|
||||
&or ("ecx",$acc); # 3
|
||||
&movz ($acc,&HB("ebx")); # 9
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
|
||||
&shl ($acc,8); # 9
|
||||
&or ("ecx",$acc); # 9
|
||||
&movd ("mm1","ecx"); # t[1] collected
|
||||
|
||||
&movz ("esi",&LB("ebx")); # 8
|
||||
&movz ("ecx",&DWP(-128,$tbl,"esi",1));# 8
|
||||
&movz ($acc,&LB("ebx")); # 8
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 8
|
||||
&shr ("ebx",16); # 13,12
|
||||
&movz ("esi",&LB("eax")); # 2
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 2
|
||||
&shl ("esi",16); # 2
|
||||
&or ("ecx","esi"); # 2
|
||||
&movz ($acc,&LB("eax")); # 2
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
|
||||
&shl ($acc,16); # 2
|
||||
&or ("ecx",$acc); # 2
|
||||
&shr ("eax",16); # 7, 6
|
||||
|
||||
&punpckldq ("mm0","mm1"); # t[0,1] collected
|
||||
|
||||
&movz ("esi",&HB("eax")); # 7
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 7
|
||||
&shl ("esi",24); # 7
|
||||
&or ("ecx","esi"); # 7
|
||||
&movz ($acc,&HB("eax")); # 7
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
|
||||
&shl ($acc,24); # 7
|
||||
&or ("ecx",$acc); # 7
|
||||
&and ("eax",0xff); # 6
|
||||
&movz ("eax",&DWP(-128,$tbl,"eax",1));# 6
|
||||
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
|
||||
&shl ("eax",16); # 6
|
||||
&or ("edx","eax"); # 6
|
||||
&movz ("esi",&HB("ebx")); # 13
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 13
|
||||
&shl ("esi",8); # 13
|
||||
&or ("ecx","esi"); # 13
|
||||
&movz ($acc,&HB("ebx")); # 13
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
|
||||
&shl ($acc,8); # 13
|
||||
&or ("ecx",$acc); # 13
|
||||
&movd ("mm4","ecx"); # t[2] collected
|
||||
&and ("ebx",0xff); # 12
|
||||
&movz ("ebx",&DWP(-128,$tbl,"ebx",1));# 12
|
||||
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
|
||||
&or ("edx","ebx"); # 12
|
||||
&movd ("mm5","edx"); # t[3] collected
|
||||
|
||||
@ -632,24 +634,22 @@ sub mmx_enccompact()
|
||||
&movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
|
||||
&pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4");
|
||||
&pand ("mm3","mm2"); &pand ("mm7","mm2");
|
||||
&movq ("mm2","mm0"); &movq ("mm6","mm4"); # r0
|
||||
&pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
|
||||
&paddb ("mm0","mm0"); &paddb ("mm4","mm4");
|
||||
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
|
||||
&movq ("mm3","mm2"); &movq ("mm7","mm6");
|
||||
&pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r2^r0
|
||||
&pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
|
||||
&pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
|
||||
&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
|
||||
|
||||
&movq ("mm2","mm3"); &movq ("mm6","mm7");
|
||||
&pslld ("mm3",8); &pslld ("mm7",8);
|
||||
&psrld ("mm2",16); &psrld ("mm6",16);
|
||||
&psrld ("mm2",24); &psrld ("mm6",24);
|
||||
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
|
||||
&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>16
|
||||
&pslld ("mm3",8); &pslld ("mm7",8);
|
||||
&psrld ("mm2",8); &psrld ("mm6",8);
|
||||
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<16
|
||||
&movq ("mm3","mm1"); &movq ("mm7","mm5");
|
||||
&pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
|
||||
|
||||
&psrld ("mm1",8); &psrld ("mm5",8);
|
||||
&movq ("mm3","mm1"); &movq ("mm7","mm5");
|
||||
&movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
|
||||
&psrld ("mm1",8); &psrld ("mm5",8);
|
||||
&pslld ("mm3",24); &pslld ("mm7",24);
|
||||
&pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)<<8
|
||||
&pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)>>24
|
||||
@ -1105,7 +1105,7 @@ sub enclast()
|
||||
&mov (&DWP(4,$acc),$s1);
|
||||
&mov (&DWP(8,$acc),$s2);
|
||||
&mov (&DWP(12,$acc),$s3);
|
||||
&jmp (&label("ret"));
|
||||
&function_end_A();
|
||||
|
||||
&set_label("mmx",16);
|
||||
&movq ("mm0",&QWP(0,$acc));
|
||||
@ -1116,8 +1116,6 @@ sub enclast()
|
||||
&movq (&QWP(0,$acc),"mm0"); # write output data
|
||||
&movq (&QWP(8,$acc),"mm4");
|
||||
&emms ();
|
||||
|
||||
&set_label("ret",4);
|
||||
&function_end("AES_encrypt");
|
||||
|
||||
#--------------------------------------------------------------------#
|
||||
@ -1140,11 +1138,11 @@ sub deccompact()
|
||||
if($i==3) { &$Fn ($key,&DWP(20,"esp")); }
|
||||
else { &mov ($out,$s[0]); }
|
||||
&and ($out,0xFF);
|
||||
&movz ($out,&DWP(-128,$td,$out,1));
|
||||
&movz ($out,&BP(-128,$td,$out,1));
|
||||
|
||||
if ($i==3) { $tmp=$s[1]; }
|
||||
&movz ($tmp,&HB($s[1]));
|
||||
&movz ($tmp,&DWP(-128,$td,$tmp,1));
|
||||
&movz ($tmp,&BP(-128,$td,$tmp,1));
|
||||
&shl ($tmp,8);
|
||||
&xor ($out,$tmp);
|
||||
|
||||
@ -1152,14 +1150,14 @@ sub deccompact()
|
||||
else { mov ($tmp,$s[2]); }
|
||||
&shr ($tmp,16);
|
||||
&and ($tmp,0xFF);
|
||||
&movz ($tmp,&DWP(-128,$td,$tmp,1));
|
||||
&movz ($tmp,&BP(-128,$td,$tmp,1));
|
||||
&shl ($tmp,16);
|
||||
&xor ($out,$tmp);
|
||||
|
||||
if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],&DWP(8,"esp")); }
|
||||
else { &mov ($tmp,$s[3]); }
|
||||
&shr ($tmp,24);
|
||||
&movz ($tmp,&DWP(-128,$td,$tmp,1));
|
||||
&movz ($tmp,&BP(-128,$td,$tmp,1));
|
||||
&shl ($tmp,24);
|
||||
&xor ($out,$tmp);
|
||||
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
|
||||
@ -1301,80 +1299,80 @@ sub mmx_deccompact()
|
||||
&movd ("eax","mm1"); # 7, 6, 1, 0
|
||||
|
||||
&pshufw ("mm5","mm4",0x09); # 13,12,11,10
|
||||
&movz ("esi",&LB("eax")); # 0
|
||||
&movz ("ecx",&DWP(-128,$tbl,"esi",1));# 0
|
||||
&movz ($acc,&LB("eax")); # 0
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
|
||||
&movd ("ebx","mm5"); # 13,12,11,10
|
||||
&movz ("edx",&HB("eax")); # 1
|
||||
&movz ("edx",&DWP(-128,$tbl,"edx",1));# 1
|
||||
&movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
|
||||
&shl ("edx",8); # 1
|
||||
|
||||
&pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
|
||||
&movz ("esi",&LB("ebx")); # 10
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 10
|
||||
&shl ("esi",16); # 10
|
||||
&or ("ecx","esi"); # 10
|
||||
&movz ($acc,&LB("ebx")); # 10
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 10
|
||||
&shl ($acc,16); # 10
|
||||
&or ("ecx",$acc); # 10
|
||||
&shr ("eax",16); # 7, 6
|
||||
&movz ("esi",&HB("ebx")); # 11
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 11
|
||||
&shl ("esi",24); # 11
|
||||
&or ("edx","esi"); # 11
|
||||
&movz ($acc,&HB("ebx")); # 11
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 11
|
||||
&shl ($acc,24); # 11
|
||||
&or ("edx",$acc); # 11
|
||||
&shr ("ebx",16); # 13,12
|
||||
|
||||
&pshufw ("mm6","mm4",0x03); # 9, 8,15,14
|
||||
&movz ("esi",&HB("eax")); # 7
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 7
|
||||
&shl ("esi",24); # 7
|
||||
&or ("ecx","esi"); # 7
|
||||
&movz ("esi",&HB("ebx")); # 13
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 13
|
||||
&shl ("esi",8); # 13
|
||||
&or ("ecx","esi"); # 13
|
||||
&movz ($acc,&HB("eax")); # 7
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 7
|
||||
&shl ($acc,24); # 7
|
||||
&or ("ecx",$acc); # 7
|
||||
&movz ($acc,&HB("ebx")); # 13
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 13
|
||||
&shl ($acc,8); # 13
|
||||
&or ("ecx",$acc); # 13
|
||||
&movd ("mm0","ecx"); # t[0] collected
|
||||
|
||||
&movz ("esi",&LB("eax")); # 6
|
||||
&movz ($acc,&LB("eax")); # 6
|
||||
&movd ("eax","mm2"); # 3, 2, 5, 4
|
||||
&movz ("ecx",&DWP(-128,$tbl,"esi",1));# 6
|
||||
&movz ("ecx",&BP(-128,$tbl,$acc,1)); # 6
|
||||
&shl ("ecx",16); # 6
|
||||
&movz ("esi",&LB("ebx")); # 12
|
||||
&movz ($acc,&LB("ebx")); # 12
|
||||
&movd ("ebx","mm6"); # 9, 8,15,14
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 12
|
||||
&or ("ecx","esi"); # 12
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 12
|
||||
&or ("ecx",$acc); # 12
|
||||
|
||||
&movz ("esi",&LB("eax")); # 4
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 4
|
||||
&or ("edx","esi"); # 4
|
||||
&movz ("esi",&LB("ebx")); # 14
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 14
|
||||
&shl ("esi",16); # 14
|
||||
&or ("edx","esi"); # 14
|
||||
&movz ($acc,&LB("eax")); # 4
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 4
|
||||
&or ("edx",$acc); # 4
|
||||
&movz ($acc,&LB("ebx")); # 14
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 14
|
||||
&shl ($acc,16); # 14
|
||||
&or ("edx",$acc); # 14
|
||||
&movd ("mm1","edx"); # t[1] collected
|
||||
|
||||
&movz ("esi",&HB("eax")); # 5
|
||||
&movz ("edx",&DWP(-128,$tbl,"esi",1));# 5
|
||||
&movz ($acc,&HB("eax")); # 5
|
||||
&movz ("edx",&BP(-128,$tbl,$acc,1)); # 5
|
||||
&shl ("edx",8); # 5
|
||||
&movz ("esi",&HB("ebx")); # 15
|
||||
&movz ($acc,&HB("ebx")); # 15
|
||||
&shr ("eax",16); # 3, 2
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 15
|
||||
&shl ("esi",24); # 15
|
||||
&or ("edx","esi"); # 15
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 15
|
||||
&shl ($acc,24); # 15
|
||||
&or ("edx",$acc); # 15
|
||||
&shr ("ebx",16); # 9, 8
|
||||
|
||||
&punpckldq ("mm0","mm1"); # t[0,1] collected
|
||||
|
||||
&movz ("esi",&HB("ebx")); # 9
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 9
|
||||
&shl ("esi",8); # 9
|
||||
&or ("ecx","esi"); # 9
|
||||
&movz ($acc,&HB("ebx")); # 9
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 9
|
||||
&shl ($acc,8); # 9
|
||||
&or ("ecx",$acc); # 9
|
||||
&and ("ebx",0xff); # 8
|
||||
&movz ("ebx",&DWP(-128,$tbl,"ebx",1));# 8
|
||||
&movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
|
||||
&or ("edx","ebx"); # 8
|
||||
&movz ("esi",&LB("eax")); # 2
|
||||
&movz ("esi",&DWP(-128,$tbl,"esi",1));# 2
|
||||
&shl ("esi",16); # 2
|
||||
&or ("edx","esi"); # 2
|
||||
&movz ($acc,&LB("eax")); # 2
|
||||
&movz ($acc,&BP(-128,$tbl,$acc,1)); # 2
|
||||
&shl ($acc,16); # 2
|
||||
&or ("edx",$acc); # 2
|
||||
&movd ("mm4","edx"); # t[2] collected
|
||||
&movz ("eax",&HB("eax")); # 3
|
||||
&movz ("eax",&DWP(-128,$tbl,"eax",1));# 3
|
||||
&movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
|
||||
&shl ("eax",24); # 3
|
||||
&or ("ecx","eax"); # 3
|
||||
&movd ("mm5","ecx"); # t[3] collected
|
||||
@ -1407,7 +1405,6 @@ sub mmx_deccompact()
|
||||
&mov ($s2,&DWP(192-128,$tbl));
|
||||
&mov ($s3,&DWP(224-128,$tbl));
|
||||
|
||||
&align (4);
|
||||
&set_label("loop",16);
|
||||
&mmx_deccompact();
|
||||
&add ($key,16);
|
||||
@ -1536,11 +1533,11 @@ sub declast()
|
||||
if($i==3) { &mov ($key,&DWP(20,"esp")); }
|
||||
else { &mov ($out,$s[0]); }
|
||||
&and ($out,0xFF);
|
||||
&movz ($out,&DWP(0,$td,$out,1));
|
||||
&movz ($out,&BP(0,$td,$out,1));
|
||||
|
||||
if ($i==3) { $tmp=$s[1]; }
|
||||
&movz ($tmp,&HB($s[1]));
|
||||
&movz ($tmp,&DWP(0,$td,$tmp,1));
|
||||
&movz ($tmp,&BP(0,$td,$tmp,1));
|
||||
&shl ($tmp,8);
|
||||
&xor ($out,$tmp);
|
||||
|
||||
@ -1548,14 +1545,14 @@ sub declast()
|
||||
else { mov ($tmp,$s[2]); }
|
||||
&shr ($tmp,16);
|
||||
&and ($tmp,0xFF);
|
||||
&movz ($tmp,&DWP(0,$td,$tmp,1));
|
||||
&movz ($tmp,&BP(0,$td,$tmp,1));
|
||||
&shl ($tmp,16);
|
||||
&xor ($out,$tmp);
|
||||
|
||||
if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
|
||||
else { &mov ($tmp,$s[3]); }
|
||||
&shr ($tmp,24);
|
||||
&movz ($tmp,&DWP(0,$td,$tmp,1));
|
||||
&movz ($tmp,&BP(0,$td,$tmp,1));
|
||||
&shl ($tmp,24);
|
||||
&xor ($out,$tmp);
|
||||
if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
|
||||
@ -1895,7 +1892,7 @@ sub declast()
|
||||
&mov (&DWP(4,$acc),$s1);
|
||||
&mov (&DWP(8,$acc),$s2);
|
||||
&mov (&DWP(12,$acc),$s3);
|
||||
&jmp (&label("ret"));
|
||||
&function_end_A();
|
||||
|
||||
&set_label("mmx",16);
|
||||
&movq ("mm0",&QWP(0,$acc));
|
||||
@ -1906,8 +1903,6 @@ sub declast()
|
||||
&movq (&QWP(0,$acc),"mm0"); # write output data
|
||||
&movq (&QWP(8,$acc),"mm4");
|
||||
&emms ();
|
||||
|
||||
&set_label("ret",4);
|
||||
&function_end("AES_decrypt");
|
||||
|
||||
# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
|
||||
@ -2357,27 +2352,26 @@ my $mark=&DWP(72+240,"esp"); #copy of aes_key->rounds
|
||||
sub enckey()
|
||||
{
|
||||
&movz ("esi",&LB("edx")); # rk[i]>>0
|
||||
&mov ("ebx",&DWP(2,$tbl,"esi",8));
|
||||
&movz ("ebx",&BP(-128,$tbl,"esi",1));
|
||||
&movz ("esi",&HB("edx")); # rk[i]>>8
|
||||
&and ("ebx",0xFF000000);
|
||||
&shl ("ebx",24);
|
||||
&xor ("eax","ebx");
|
||||
|
||||
&mov ("ebx",&DWP(2,$tbl,"esi",8));
|
||||
&movz ("ebx",&BP(-128,$tbl,"esi",1));
|
||||
&shr ("edx",16);
|
||||
&and ("ebx",0x000000FF);
|
||||
&movz ("esi",&LB("edx")); # rk[i]>>16
|
||||
&xor ("eax","ebx");
|
||||
|
||||
&mov ("ebx",&DWP(0,$tbl,"esi",8));
|
||||
&movz ("ebx",&BP(-128,$tbl,"esi",1));
|
||||
&movz ("esi",&HB("edx")); # rk[i]>>24
|
||||
&and ("ebx",0x0000FF00);
|
||||
&shl ("ebx",8);
|
||||
&xor ("eax","ebx");
|
||||
|
||||
&mov ("ebx",&DWP(0,$tbl,"esi",8));
|
||||
&and ("ebx",0x00FF0000);
|
||||
&movz ("ebx",&BP(-128,$tbl,"esi",1));
|
||||
&shl ("ebx",16);
|
||||
&xor ("eax","ebx");
|
||||
|
||||
&xor ("eax",&DWP(2048+1024,$tbl,"ecx",4)); # rcon
|
||||
&xor ("eax",&BP(1024-128,$tbl,"ecx",4)); # rcon
|
||||
}
|
||||
|
||||
# int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
|
||||
@ -2396,6 +2390,17 @@ sub enckey()
|
||||
&set_label("pic_point");
|
||||
&blindpop($tbl);
|
||||
&lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
|
||||
&lea ($tbl,&DWP(2048+128,$tbl));
|
||||
|
||||
# prefetch Te4
|
||||
&mov ("eax",&DWP(0-128,$tbl));
|
||||
&mov ("ebx",&DWP(32-128,$tbl));
|
||||
&mov ("ecx",&DWP(64-128,$tbl));
|
||||
&mov ("edx",&DWP(96-128,$tbl));
|
||||
&mov ("eax",&DWP(128-128,$tbl));
|
||||
&mov ("ebx",&DWP(160-128,$tbl));
|
||||
&mov ("ecx",&DWP(192-128,$tbl));
|
||||
&mov ("edx",&DWP(224-128,$tbl));
|
||||
|
||||
&mov ("ecx",&wparam(1)); # number of bits in key
|
||||
&cmp ("ecx",128);
|
||||
@ -2536,24 +2541,23 @@ sub enckey()
|
||||
&mov ("edx","eax");
|
||||
&mov ("eax",&DWP(16,"edi")); # rk[4]
|
||||
&movz ("esi",&LB("edx")); # rk[11]>>0
|
||||
&mov ("ebx",&DWP(2,$tbl,"esi",8));
|
||||
&movz ("ebx",&BP(-128,$tbl,"esi",1));
|
||||
&movz ("esi",&HB("edx")); # rk[11]>>8
|
||||
&and ("ebx",0x000000FF);
|
||||
&xor ("eax","ebx");
|
||||
|
||||
&mov ("ebx",&DWP(0,$tbl,"esi",8));
|
||||
&movz ("ebx",&BP(-128,$tbl,"esi",1));
|
||||
&shr ("edx",16);
|
||||
&and ("ebx",0x0000FF00);
|
||||
&shl ("ebx",8);
|
||||
&movz ("esi",&LB("edx")); # rk[11]>>16
|
||||
&xor ("eax","ebx");
|
||||
|
||||
&mov ("ebx",&DWP(0,$tbl,"esi",8));
|
||||
&movz ("ebx",&BP(-128,$tbl,"esi",1));
|
||||
&movz ("esi",&HB("edx")); # rk[11]>>24
|
||||
&and ("ebx",0x00FF0000);
|
||||
&shl ("ebx",16);
|
||||
&xor ("eax","ebx");
|
||||
|
||||
&mov ("ebx",&DWP(2,$tbl,"esi",8));
|
||||
&and ("ebx",0xFF000000);
|
||||
&movz ("ebx",&BP(-128,$tbl,"esi",1));
|
||||
&shl ("ebx",24);
|
||||
&xor ("eax","ebx");
|
||||
|
||||
&mov (&DWP(48,"edi"),"eax"); # rk[12]
|
||||
@ -2578,24 +2582,61 @@ sub enckey()
|
||||
&function_end("AES_set_encrypt_key");
|
||||
|
||||
sub deckey()
|
||||
{ my ($i,$ptr,$te,$td) = @_;
|
||||
{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
|
||||
my $tmp = $tbl;
|
||||
|
||||
&mov ("eax",&DWP($i,$ptr));
|
||||
&mov ("edx","eax");
|
||||
&movz ("ebx",&HB("eax"));
|
||||
&shr ("edx",16);
|
||||
&and ("eax",0xFF);
|
||||
&movz ("eax",&BP(2,$te,"eax",8));
|
||||
&movz ("ebx",&BP(2,$te,"ebx",8));
|
||||
&mov ("eax",&DWP(0,$td,"eax",8));
|
||||
&xor ("eax",&DWP(3,$td,"ebx",8));
|
||||
&movz ("ebx",&HB("edx"));
|
||||
&and ("edx",0xFF);
|
||||
&movz ("edx",&BP(2,$te,"edx",8));
|
||||
&movz ("ebx",&BP(2,$te,"ebx",8));
|
||||
&xor ("eax",&DWP(2,$td,"edx",8));
|
||||
&xor ("eax",&DWP(1,$td,"ebx",8));
|
||||
&mov (&DWP($i,$ptr),"eax");
|
||||
&mov ($acc,$tp1);
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&mov ($tp2,$tp1);
|
||||
&shr ($tmp,7);
|
||||
&and ($tp2,0x7f7f7f7f);
|
||||
&sub ($acc,$tmp);
|
||||
&add ($tp2,$tp2);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&xor ($acc,$tp2);
|
||||
&mov ($tp2,$acc);
|
||||
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&mov ($tp4,$tp2);
|
||||
&xor ($tp2,$tp1); # tp2^tp1
|
||||
&shr ($tmp,7);
|
||||
&and ($tp4,0x7f7f7f7f);
|
||||
&sub ($acc,$tmp);
|
||||
&add ($tp4,$tp4);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&xor ($acc,$tp4);
|
||||
&mov ($tp4,$acc);
|
||||
|
||||
&and ($acc,0x80808080);
|
||||
&mov ($tmp,$acc);
|
||||
&mov ($tp8,$tp4);
|
||||
&xor ($tp4,$tp1); # tp4^tp1
|
||||
&shr ($tmp,7);
|
||||
&and ($tp8,0x7f7f7f7f);
|
||||
&sub ($acc,$tmp);
|
||||
&add ($tp8,$tp8);
|
||||
&and ($acc,0x1b1b1b1b);
|
||||
&rotl ($tp1,8); # = ROTATE(tp1,8)
|
||||
&xor ($tp8,$acc);
|
||||
|
||||
&mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
|
||||
|
||||
&xor ($tp1,$tp2);
|
||||
&xor ($tp2,$tp8);
|
||||
&xor ($tp1,$tp4);
|
||||
&rotl ($tp2,24);
|
||||
&xor ($tp4,$tp8);
|
||||
&xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
|
||||
&rotl ($tp4,16);
|
||||
&xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
|
||||
&rotl ($tp8,8);
|
||||
&xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
|
||||
&mov ($tp2,$tmp);
|
||||
&xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
|
||||
|
||||
&mov (&DWP(4*$i,$key),$tp1);
|
||||
}
|
||||
|
||||
# int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
|
||||
@ -2627,8 +2668,7 @@ sub deckey()
|
||||
&lea ("ecx",&DWP(0,"","ecx",4));
|
||||
&lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
|
||||
|
||||
&align (4);
|
||||
&set_label("invert"); # invert order of chunks
|
||||
&set_label("invert",4); # invert order of chunks
|
||||
&mov ("eax",&DWP(0,"esi"));
|
||||
&mov ("ebx",&DWP(4,"esi"));
|
||||
&mov ("ecx",&DWP(0,"edi"));
|
||||
@ -2650,24 +2690,21 @@ sub deckey()
|
||||
&cmp ("esi","edi");
|
||||
&jne (&label("invert"));
|
||||
|
||||
&call (&label("pic_point"));
|
||||
&set_label("pic_point");
|
||||
blindpop($tbl);
|
||||
&lea ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
|
||||
&lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
|
||||
&mov ($key,&wparam(2));
|
||||
&mov ($acc,&DWP(240,$key)); # pull number of rounds
|
||||
&lea ($acc,&DWP(-2,$acc,$acc));
|
||||
&lea ($acc,&DWP(0,$key,$acc,8));
|
||||
&mov (&wparam(2),$acc);
|
||||
|
||||
&mov ("esi",&wparam(2));
|
||||
&mov ("ecx",&DWP(240,"esi")); # pull number of rounds
|
||||
&dec ("ecx");
|
||||
&align (4);
|
||||
&set_label("permute"); # permute the key schedule
|
||||
&add ("esi",16);
|
||||
&deckey (0,"esi",$tbl,"edi");
|
||||
&deckey (4,"esi",$tbl,"edi");
|
||||
&deckey (8,"esi",$tbl,"edi");
|
||||
&deckey (12,"esi",$tbl,"edi");
|
||||
&dec ("ecx");
|
||||
&jnz (&label("permute"));
|
||||
&mov ($s0,&DWP(16,$key)); # modulo-scheduled load
|
||||
&set_label("permute",4); # permute the key schedule
|
||||
&add ($key,16);
|
||||
&deckey (0,$key,$s0,$s1,$s2,$s3);
|
||||
&deckey (1,$key,$s1,$s2,$s3,$s0);
|
||||
&deckey (2,$key,$s2,$s3,$s0,$s1);
|
||||
&deckey (3,$key,$s3,$s0,$s1,$s2);
|
||||
&cmp ($key,&wparam(2));
|
||||
&jb (&label("permute"));
|
||||
|
||||
&xor ("eax","eax"); # return success
|
||||
&function_end("AES_set_decrypt_key");
|
||||
|
Loading…
Reference in New Issue
Block a user