~15% better AES x86_64 assembler.

This commit is contained in:
Andy Polyakov 2005-07-18 09:15:04 +00:00
parent f42e6d24f2
commit afbe674edb
2 changed files with 526 additions and 180 deletions

View File

@ -6,9 +6,9 @@
# forms are granted according to the OpenSSL license. # forms are granted according to the OpenSSL license.
# ==================================================================== # ====================================================================
# #
# Version 1.0. # Version 1.1.
# #
# aes-*-cbc benchmarks are improved by 50% [compared to gcc 3.3.2 on # aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version # Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
# [you'll notice a lot of resemblance], such as compressed S-boxes # [you'll notice a lot of resemblance], such as compressed S-boxes
# in little-endian byte order, prefetch of these tables in CBC mode, # in little-endian byte order, prefetch of these tables in CBC mode,
@ -18,14 +18,14 @@
# Performance in number of cycles per processed byte for 128-bit key: # Performance in number of cycles per processed byte for 128-bit key:
# #
# ECB CBC encrypt # ECB CBC encrypt
# AMD64 15.6 14.6(*) # AMD64 13.7 13.0(*)
# EM64T 23.3(**) 21.4(*) # EM64T 20.2 18.6(*)
# #
# (*) CBC benchmarks are better than ECB thanks to custom ABI used # (*) CBC benchmarks are better than ECB thanks to custom ABI used
# by the private block encryption function. # by the private block encryption function.
# (**) This module exhibits virtually same ECB performance as 32-bit
# counterpart on [current] Intel CPU.
$verticalspin=1; # unlike 32-bit version $verticalspin performs
# ~15% better on both AMD and Intel cores
$output=shift; $output=shift;
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
@ -35,20 +35,22 @@ $s0="%eax";
$s1="%ebx"; $s1="%ebx";
$s2="%ecx"; $s2="%ecx";
$s3="%edx"; $s3="%edx";
$inp="%rdi"; $acc0="%esi";
$out="%rsi"; $acc1="%edi";
$acc0="%ebp"; $acc2="%ebp";
$acc1="%r8d"; $inp="%r8";
$acc2="%r9d"; $out="%r9";
$t0="%r10d"; $t0="%r10d";
$t1="%r11d"; $t1="%r11d";
$t2="%r12d"; $t2="%r12d";
$cnt="%r13d"; $rnds="%r13d";
$tbl="%r14"; $sbox="%r14";
$key="%r15"; $key="%r15";
sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; } sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; $r; } sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
$r =~ s/%[er]([sd]i)/%\1l/;
$r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
sub _data_word() sub _data_word()
{ my $i; { my $i;
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; } while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@ -61,6 +63,169 @@ sub data_word()
$code.=sprintf"0x%08x\n",$last; $code.=sprintf"0x%08x\n",$last;
} }
# Emit one full inner AES encryption round in "vertical spin" style:
# all four state words $s0..$s3 are transformed in a single interleaved
# instruction sequence (contrast the per-word &encstep), hand-scheduled
# for a 3-way issue pipeline.  Table entries at ($sbox,idx,8) are 8 bytes
# wide -- _data_word emits every table word twice -- so reading at byte
# offset k within an entry yields the little-endian word rotated by k
# bytes; offsets 0..3 thus supply the four rotations needed per column.
# The sequence also advances $key by 16 and finishes with AddRoundKey
# (xor of 0/4/8/12($key)).  NOTE(review): do not reorder instructions;
# the interleaving is deliberate scheduling, not accidental.
sub encvert()
{ my $t3="%r8d"; # zaps $inp!
# $qsN hold the literal text "$sN" (quotes included): the back-ticked
# `&lo(...)`/`&hi(...)` fragments below survive heredoc interpolation
# verbatim and are only evaluated by the final
# s/\`([^\`]*)\`/eval($1)/gem pass over $code, at which point "$sN"
# interpolates to the actual register name.
my $qs0='"$s0"';
my $qs1='"$s1"';
my $qs2='"$s2"';
my $qs3='"$s3"';
$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo($qs0)`,$acc0
	movzb	`&lo($qs1)`,$acc1
	movzb	`&lo($qs2)`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi($qs1)`,$acc0
	movzb	`&hi($qs2)`,$acc1
	movzb	`&lo($qs3)`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi($qs3)`,$acc0
	shr	\$16,$s2
	movzb	`&hi($qs0)`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s0

	movzb	`&lo($qs2)`,$acc0
	movzb	`&lo($qs3)`,$acc1
	movzb	`&lo($qs0)`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi($qs3)`,$acc0
	movzb	`&hi($qs0)`,$acc1
	movzb	`&lo($qs1)`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	mov	12($key),$s3
	movzb	`&hi($qs1)`,$acc1
	movzb	`&hi($qs2)`,$acc2
	mov	0($key),$s0
	xor	1($sbox,$acc1,8),$t2
	xor	1($sbox,$acc2,8),$t3

	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
# $s0..$s3 now hold the next-round state; $key points at the round key
# just consumed (caller's epilogue addresses the final key as 16+N($key)).
}
# Emit the FINAL AES encryption round, vertical-spin style: like encvert
# but with no MixColumns -- each output byte is an S-box lookup only.
# The plain S-box bytes are extracted from the same 8-byte-stride Te
# entries by combining a byte offset (0 or 2 within the duplicated
# little-endian word) with an AND mask that isolates exactly one byte
# lane (0x000000ff / 0x0000ff00 / 0x00ff0000 / 0xff000000); the lanes
# are then OR-merged via xor into $t0..$t3.  Finishes with the last
# AddRoundKey using 16+0/4/8/12($key) -- i.e. the round key one slot
# past where the main loop left $key.  NOTE(review): instruction order
# is deliberate scheduling; do not reorder.
sub enclastvert()
{ my $t3="%r8d"; # zaps $inp!
# See encvert: $qsN defer register-name interpolation until the final
# backtick-eval pass over $code.
my $qs0='"$s0"';
my $qs1='"$s1"';
my $qs2='"$s2"';
my $qs3='"$s3"';
$code.=<<___;
	movzb	`&lo($qs0)`,$acc0
	movzb	`&lo($qs1)`,$acc1
	movzb	`&lo($qs2)`,$acc2
	mov	2($sbox,$acc0,8),$t0
	mov	2($sbox,$acc1,8),$t1
	mov	2($sbox,$acc2,8),$t2

	and	\$0x000000ff,$t0
	and	\$0x000000ff,$t1
	and	\$0x000000ff,$t2

	movzb	`&lo($qs3)`,$acc0
	movzb	`&hi($qs1)`,$acc1
	movzb	`&hi($qs2)`,$acc2
	mov	2($sbox,$acc0,8),$t3
	mov	0($sbox,$acc1,8),$acc1	#$t0
	mov	0($sbox,$acc2,8),$acc2	#$t1

	and	\$0x000000ff,$t3
	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s2

	movzb	`&hi($qs3)`,$acc0
	movzb	`&hi($qs0)`,$acc1
	shr	\$16,$s3

	mov	0($sbox,$acc0,8),$acc0	#$t2
	mov	0($sbox,$acc1,8),$acc1	#$t3
	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1

	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s0

	movzb	`&lo($qs2)`,$acc0
	movzb	`&lo($qs3)`,$acc1
	movzb	`&lo($qs0)`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t0
	mov	0($sbox,$acc1,8),$acc1	#$t1
	mov	0($sbox,$acc2,8),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo($qs1)`,$acc0
	movzb	`&hi($qs3)`,$acc1
	movzb	`&hi($qs0)`,$acc2
	mov	0($sbox,$acc0,8),$acc0	#$t3
	mov	2($sbox,$acc1,8),$acc1	#$t0
	mov	2($sbox,$acc2,8),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi($qs1)`,$acc0
	movzb	`&hi($qs2)`,$acc1
	mov	16+12($key),$s3
	mov	2($sbox,$acc0,8),$acc0	#$t2
	mov	2($sbox,$acc1,8),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
# $s0..$s3 now hold the finished ciphertext block.
}
sub encstep() sub encstep()
{ my ($i,@s) = @_; { my ($i,@s) = @_;
my $tmp0=$acc0; my $tmp0=$acc0;
@ -68,24 +233,28 @@ sub encstep()
my $tmp2=$acc2; my $tmp2=$acc2;
my $out=($t0,$t1,$t2,$s[0])[$i]; my $out=($t0,$t1,$t2,$s[0])[$i];
$code.=" mov $s[0],$out\n" if ($i!=3); if ($i==3) {
$tmp1=$s[2] if ($i==3); $tmp0=$s[1];
$tmp1=$s[2];
$tmp2=$s[3];
}
$code.=" movzb ".&lo($s[0]).",$out\n";
$code.=" mov $s[2],$tmp1\n" if ($i!=3); $code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n"; $code.=" lea 16($key),$key\n" if ($i==0);
$code.=" mov 0($tbl,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
$tmp0=$s[1] if ($i==3);
$code.=" movzb ".&hi($s[1]).",$tmp0\n"; $code.=" movzb ".&hi($s[1]).",$tmp0\n";
$code.=" and \$0xFF,$tmp1\n"; $code.=" mov 0($sbox,$out,8),$out\n";
$code.=" shr \$24,$tmp2\n";
$code.=" xor 3($tbl,$tmp0,8),$out\n"; $code.=" shr \$16,$tmp1\n";
$code.=" xor 2($tbl,$tmp1,8),$out\n"; $code.=" mov $s[3],$tmp2\n" if ($i!=3);
$code.=" xor 1($tbl,$tmp2,8),$out\n"; $code.=" xor 3($sbox,$tmp0,8),$out\n";
$code.=" movzb ".&lo($tmp1).",$tmp1\n";
$code.=" shr \$24,$tmp2\n";
$code.=" xor 4*$i($key),$out\n";
$code.=" xor 2($sbox,$tmp1,8),$out\n";
$code.=" xor 1($sbox,$tmp2,8),$out\n";
$code.=" mov $t0,$s[1]\n" if ($i==3); $code.=" mov $t0,$s[1]\n" if ($i==3);
$code.=" mov $t1,$s[2]\n" if ($i==3); $code.=" mov $t1,$s[2]\n" if ($i==3);
@ -100,25 +269,26 @@ sub enclast()
my $tmp2=$acc2; my $tmp2=$acc2;
my $out=($t0,$t1,$t2,$s[0])[$i]; my $out=($t0,$t1,$t2,$s[0])[$i];
$code.=" mov $s[0],$out\n" if ($i!=3); if ($i==3) {
$tmp1=$s[2] if ($i==3); $tmp0=$s[1];
$tmp1=$s[2];
$tmp2=$s[3];
}
$code.=" movzb ".&lo($s[0]).",$out\n";
$code.=" mov $s[2],$tmp1\n" if ($i!=3); $code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n";
$code.=" mov 2($tbl,$out,8),$out\n"; $code.=" mov 2($sbox,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n"; $code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3); $code.=" mov $s[3],$tmp2\n" if ($i!=3);
$code.=" and \$0x000000ff,$out\n"; $code.=" and \$0x000000ff,$out\n";
$tmp0=$s[1] if ($i==3);
$code.=" movzb ".&hi($s[1]).",$tmp0\n"; $code.=" movzb ".&hi($s[1]).",$tmp0\n";
$code.=" and \$0xFF,$tmp1\n"; $code.=" movzb ".&lo($tmp1).",$tmp1\n";
$code.=" shr \$24,$tmp2\n"; $code.=" shr \$24,$tmp2\n";
$code.=" mov 0($tbl,$tmp0,8),$tmp0\n"; $code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
$code.=" mov 0($tbl,$tmp1,8),$tmp1\n"; $code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
$code.=" mov 2($tbl,$tmp2,8),$tmp2\n"; $code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
$code.=" and \$0x0000ff00,$tmp0\n"; $code.=" and \$0x0000ff00,$tmp0\n";
$code.=" and \$0x00ff0000,$tmp1\n"; $code.=" and \$0x00ff0000,$tmp1\n";
@ -142,36 +312,35 @@ _x86_64_AES_encrypt:
xor 8($key),$s2 xor 8($key),$s2
xor 12($key),$s3 xor 12($key),$s3
mov 240($key),$cnt # load key->rounds mov 240($key),$rnds # load key->rounds
sub \$1,$cnt sub \$1,$rnds
.align 4 jmp .Lenc_loop
.align 16
.Lenc_loop: .Lenc_loop:
___ ___
&encstep(0,$s0,$s1,$s2,$s3); if ($verticalspin) { &encvert(); }
&encstep(1,$s1,$s2,$s3,$s0); else { &encstep(0,$s0,$s1,$s2,$s3);
&encstep(2,$s2,$s3,$s0,$s1); &encstep(1,$s1,$s2,$s3,$s0);
&encstep(3,$s3,$s0,$s1,$s2); &encstep(2,$s2,$s3,$s0,$s1);
&encstep(3,$s3,$s0,$s1,$s2);
}
$code.=<<___; $code.=<<___;
lea 16($key),$key sub \$1,$rnds
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
sub \$1,$cnt
jnz .Lenc_loop jnz .Lenc_loop
___ ___
&enclast(0,$s0,$s1,$s2,$s3); if ($verticalspin) { &enclastvert(); }
&enclast(1,$s1,$s2,$s3,$s0); else { &enclast(0,$s0,$s1,$s2,$s3);
&enclast(2,$s2,$s3,$s0,$s1); &enclast(1,$s1,$s2,$s3,$s0);
&enclast(3,$s3,$s0,$s1,$s2); &enclast(2,$s2,$s3,$s0,$s1);
&enclast(3,$s3,$s0,$s1,$s2);
$code.=<<___;
xor 16+0($key),$s0 # xor with key
xor 16+4($key),$s1
xor 16+8($key),$s2
xor 16+12($key),$s3
___
}
$code.=<<___; $code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
.byte 0xf3,0xc3 # rep ret .byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt .size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
___ ___
@ -190,9 +359,11 @@ AES_encrypt:
push %r15 push %r15
mov %rdx,$key mov %rdx,$key
mov %rdi,$inp
mov %rsi,$out
.picmeup $tbl .picmeup $sbox
lea AES_Te-.($tbl),$tbl lea AES_Te-.($sbox),$sbox
mov 0($inp),$s0 mov 0($inp),$s0
mov 4($inp),$s1 mov 4($inp),$s1
@ -218,6 +389,169 @@ ___
#------------------------------------------------------------------# #------------------------------------------------------------------#
# Emit one full inner AES DECRYPTION round, vertical-spin style: the
# mirror of encvert, but indexing follows InvShiftRows -- note the byte
# sources rotate the opposite way (hi($qs3)/hi($qs0) feed $t0/$t1 here,
# where encvert used hi($qs1)/hi($qs2)), and the right-shift order of
# the state words ($s0,$s3,$s1,$s2) differs accordingly.  Uses the
# 8-byte-stride Td table at $sbox (byte offsets 0..3 within each
# duplicated entry give the four word rotations), advances $key by 16
# and ends with AddRoundKey.  NOTE(review): hand-scheduled for 3-way
# issue; do not reorder instructions.
sub decvert()
{ my $t3="%r8d"; # zaps $inp!
# See encvert: $qsN defer register-name interpolation until the final
# backtick-eval pass over $code.
my $qs0='"$s0"';
my $qs1='"$s1"';
my $qs2='"$s2"';
my $qs3='"$s3"';
$code.=<<___;
	# favor 3-way issue Opteron pipeline...
	movzb	`&lo($qs0)`,$acc0
	movzb	`&lo($qs1)`,$acc1
	movzb	`&lo($qs2)`,$acc2
	mov	0($sbox,$acc0,8),$t0
	mov	0($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t2

	movzb	`&hi($qs3)`,$acc0
	movzb	`&hi($qs0)`,$acc1
	movzb	`&lo($qs3)`,$acc2
	xor	3($sbox,$acc0,8),$t0
	xor	3($sbox,$acc1,8),$t1
	mov	0($sbox,$acc2,8),$t3

	movzb	`&hi($qs1)`,$acc0
	shr	\$16,$s0
	movzb	`&hi($qs2)`,$acc2
	xor	3($sbox,$acc0,8),$t2
	shr	\$16,$s3
	xor	3($sbox,$acc2,8),$t3

	shr	\$16,$s1
	lea	16($key),$key
	shr	\$16,$s2

	movzb	`&lo($qs2)`,$acc0
	movzb	`&lo($qs3)`,$acc1
	movzb	`&lo($qs0)`,$acc2
	xor	2($sbox,$acc0,8),$t0
	xor	2($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t2

	movzb	`&hi($qs1)`,$acc0
	movzb	`&hi($qs2)`,$acc1
	movzb	`&lo($qs1)`,$acc2
	xor	1($sbox,$acc0,8),$t0
	xor	1($sbox,$acc1,8),$t1
	xor	2($sbox,$acc2,8),$t3

	movzb	`&hi($qs3)`,$acc0
	mov	12($key),$s3
	movzb	`&hi($qs0)`,$acc2
	xor	1($sbox,$acc0,8),$t2
	mov	0($key),$s0
	xor	1($sbox,$acc2,8),$t3

	xor	$t0,$s0
	mov	4($key),$s1
	mov	8($key),$s2
	xor	$t2,$s2
	xor	$t1,$s1
	xor	$t3,$s3
___
# $s0..$s3 now hold the next-round state; $key points at the round key
# just consumed.
}
# Emit the FINAL AES decryption round, vertical-spin style: InvShiftRows
# plus InvSubBytes plus the last AddRoundKey (16+0/4/8/12($key)), no
# InvMixColumns.  Unlike enclastvert, the byte lookups use the separate
# 256-entry, 4-byte-stride table at $sbox+2048 -- presumably the inverse
# S-box replicated into each word (the same 2048(...,4) addressing is
# used by &declast); the data section is outside this hunk -- TODO
# confirm.  Each lookup is masked down to a single byte lane
# (0x000000ff .. 0xff000000) and xor-merged into $t0..$t3.
# NOTE(review): hand-scheduled; do not reorder instructions.
sub declastvert()
{ my $t3="%r8d"; # zaps $inp!
# See encvert: $qsN defer register-name interpolation until the final
# backtick-eval pass over $code.
my $qs0='"$s0"';
my $qs1='"$s1"';
my $qs2='"$s2"';
my $qs3='"$s3"';
$code.=<<___;
	movzb	`&lo($qs0)`,$acc0
	movzb	`&lo($qs1)`,$acc1
	movzb	`&lo($qs2)`,$acc2
	mov	2048($sbox,$acc0,4),$t0
	mov	2048($sbox,$acc1,4),$t1
	mov	2048($sbox,$acc2,4),$t2

	and	\$0x000000ff,$t0
	and	\$0x000000ff,$t1
	and	\$0x000000ff,$t2

	movzb	`&lo($qs3)`,$acc0
	movzb	`&hi($qs3)`,$acc1
	movzb	`&hi($qs0)`,$acc2
	mov	2048($sbox,$acc0,4),$t3
	mov	2048($sbox,$acc1,4),$acc1	#$t0
	mov	2048($sbox,$acc2,4),$acc2	#$t1

	and	\$0x000000ff,$t3
	and	\$0x0000ff00,$acc1
	and	\$0x0000ff00,$acc2

	xor	$acc1,$t0
	xor	$acc2,$t1
	shr	\$16,$s3

	movzb	`&hi($qs1)`,$acc0
	movzb	`&hi($qs2)`,$acc1
	shr	\$16,$s0

	mov	2048($sbox,$acc0,4),$acc0	#$t2
	mov	2048($sbox,$acc1,4),$acc1	#$t3
	and	\$0x0000ff00,$acc0
	and	\$0x0000ff00,$acc1

	shr	\$16,$s1
	xor	$acc0,$t2
	xor	$acc1,$t3
	shr	\$16,$s2

	movzb	`&lo($qs2)`,$acc0
	movzb	`&lo($qs3)`,$acc1
	movzb	`&lo($qs0)`,$acc2
	mov	2048($sbox,$acc0,4),$acc0	#$t0
	mov	2048($sbox,$acc1,4),$acc1	#$t1
	mov	2048($sbox,$acc2,4),$acc2	#$t2

	and	\$0x00ff0000,$acc0
	and	\$0x00ff0000,$acc1
	and	\$0x00ff0000,$acc2

	xor	$acc0,$t0
	xor	$acc1,$t1
	xor	$acc2,$t2

	movzb	`&lo($qs1)`,$acc0
	movzb	`&hi($qs1)`,$acc1
	movzb	`&hi($qs2)`,$acc2
	mov	2048($sbox,$acc0,4),$acc0	#$t3
	mov	2048($sbox,$acc1,4),$acc1	#$t0
	mov	2048($sbox,$acc2,4),$acc2	#$t1

	and	\$0x00ff0000,$acc0
	and	\$0xff000000,$acc1
	and	\$0xff000000,$acc2

	xor	$acc0,$t3
	xor	$acc1,$t0
	xor	$acc2,$t1

	movzb	`&hi($qs3)`,$acc0
	movzb	`&hi($qs0)`,$acc1
	mov	16+12($key),$s3
	mov	2048($sbox,$acc0,4),$acc0	#$t2
	mov	2048($sbox,$acc1,4),$acc1	#$t3
	mov	16+0($key),$s0

	and	\$0xff000000,$acc0
	and	\$0xff000000,$acc1

	xor	$acc0,$t2
	xor	$acc1,$t3

	mov	16+4($key),$s1
	mov	16+8($key),$s2
	xor	$t0,$s0
	xor	$t1,$s1
	xor	$t2,$s2
	xor	$t3,$s3
___
# $s0..$s3 now hold the finished plaintext block.
}
sub decstep() sub decstep()
{ my ($i,@s) = @_; { my ($i,@s) = @_;
my $tmp0=$acc0; my $tmp0=$acc0;
@ -230,7 +564,7 @@ sub decstep()
$code.=" mov $s[2],$tmp1\n" if ($i!=3); $code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n"; $code.=" and \$0xFF,$out\n";
$code.=" mov 0($tbl,$out,8),$out\n"; $code.=" mov 0($sbox,$out,8),$out\n";
$code.=" shr \$16,$tmp1\n"; $code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3); $tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3); $code.=" mov $s[3],$tmp2\n" if ($i!=3);
@ -240,9 +574,9 @@ sub decstep()
$code.=" and \$0xFF,$tmp1\n"; $code.=" and \$0xFF,$tmp1\n";
$code.=" shr \$24,$tmp2\n"; $code.=" shr \$24,$tmp2\n";
$code.=" xor 3($tbl,$tmp0,8),$out\n"; $code.=" xor 3($sbox,$tmp0,8),$out\n";
$code.=" xor 2($tbl,$tmp1,8),$out\n"; $code.=" xor 2($sbox,$tmp1,8),$out\n";
$code.=" xor 1($tbl,$tmp2,8),$out\n"; $code.=" xor 1($sbox,$tmp2,8),$out\n";
$code.=" mov $t2,$s[1]\n" if ($i==3); $code.=" mov $t2,$s[1]\n" if ($i==3);
$code.=" mov $t1,$s[2]\n" if ($i==3); $code.=" mov $t1,$s[2]\n" if ($i==3);
@ -262,7 +596,7 @@ sub declast()
$code.=" mov $s[2],$tmp1\n" if ($i!=3); $code.=" mov $s[2],$tmp1\n" if ($i!=3);
$code.=" and \$0xFF,$out\n"; $code.=" and \$0xFF,$out\n";
$code.=" mov 2048($tbl,$out,4),$out\n"; $code.=" mov 2048($sbox,$out,4),$out\n";
$code.=" shr \$16,$tmp1\n"; $code.=" shr \$16,$tmp1\n";
$tmp2=$s[3] if ($i==3); $tmp2=$s[3] if ($i==3);
$code.=" mov $s[3],$tmp2\n" if ($i!=3); $code.=" mov $s[3],$tmp2\n" if ($i!=3);
@ -273,9 +607,9 @@ sub declast()
$code.=" and \$0xFF,$tmp1\n"; $code.=" and \$0xFF,$tmp1\n";
$code.=" shr \$24,$tmp2\n"; $code.=" shr \$24,$tmp2\n";
$code.=" mov 2048($tbl,$tmp0,4),$tmp0\n"; $code.=" mov 2048($sbox,$tmp0,4),$tmp0\n";
$code.=" mov 2048($tbl,$tmp1,4),$tmp1\n"; $code.=" mov 2048($sbox,$tmp1,4),$tmp1\n";
$code.=" mov 2048($tbl,$tmp2,4),$tmp2\n"; $code.=" mov 2048($sbox,$tmp2,4),$tmp2\n";
$code.=" and \$0x0000ff00,$tmp0\n"; $code.=" and \$0x0000ff00,$tmp0\n";
$code.=" and \$0x00ff0000,$tmp1\n"; $code.=" and \$0x00ff0000,$tmp1\n";
@ -299,36 +633,42 @@ _x86_64_AES_decrypt:
xor 8($key),$s2 xor 8($key),$s2
xor 12($key),$s3 xor 12($key),$s3
mov 240($key),$cnt # load key->rounds mov 240($key),$rnds # load key->rounds
sub \$1,$cnt sub \$1,$rnds
.align 4 jmp .Ldec_loop
.align 16
.Ldec_loop: .Ldec_loop:
___ ___
&decstep(0,$s0,$s3,$s2,$s1); if ($verticalspin) { &decvert(); }
&decstep(1,$s1,$s0,$s3,$s2); else { &decstep(0,$s0,$s3,$s2,$s1);
&decstep(2,$s2,$s1,$s0,$s3); &decstep(1,$s1,$s0,$s3,$s2);
&decstep(3,$s3,$s2,$s1,$s0); &decstep(2,$s2,$s1,$s0,$s3);
&decstep(3,$s3,$s2,$s1,$s0);
$code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
___
}
$code.=<<___; $code.=<<___;
lea 16($key),$key sub \$1,$rnds
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
sub \$1,$cnt
jnz .Ldec_loop jnz .Ldec_loop
___ ___
&declast(0,$s0,$s3,$s2,$s1); if ($verticalspin) { &declastvert(); }
&declast(1,$s1,$s0,$s3,$s2); else { &declast(0,$s0,$s3,$s2,$s1);
&declast(2,$s2,$s1,$s0,$s3); &declast(1,$s1,$s0,$s3,$s2);
&declast(3,$s3,$s2,$s1,$s0); &declast(2,$s2,$s1,$s0,$s3);
&declast(3,$s3,$s2,$s1,$s0);
$code.=<<___;
xor 16+0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
___
}
$code.=<<___; $code.=<<___;
lea 16($key),$key
xor 0($key),$s0 # xor with key
xor 4($key),$s1
xor 8($key),$s2
xor 12($key),$s3
.byte 0xf3,0xc3 # rep ret .byte 0xf3,0xc3 # rep ret
.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt .size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
___ ___
@ -347,9 +687,11 @@ AES_decrypt:
push %r15 push %r15
mov %rdx,$key mov %rdx,$key
mov %rdi,$inp
mov %rsi,$out
.picmeup $tbl .picmeup $sbox
lea AES_Td-.($tbl),$tbl lea AES_Td-.($sbox),$sbox
mov 0($inp),$s0 mov 0($inp),$s0
mov 4($inp),$s1 mov 4($inp),$s1
@ -719,24 +1061,24 @@ AES_cbc_encrypt:
pushfq pushfq
cld cld
.picmeup $tbl .picmeup $sbox
.Lcbc_pic_point: .Lcbc_pic_point:
cmp \$0,%r9 cmp \$0,%r9
je .LDECRYPT je .LDECRYPT
lea AES_Te-.Lcbc_pic_point($tbl),$tbl lea AES_Te-.Lcbc_pic_point($sbox),$sbox
# allocate aligned stack frame... # allocate aligned stack frame...
lea -64-248(%rsp),$key lea -64-248(%rsp),$key
and \$-64,$key and \$-64,$key
# ... and make it doesn't alias with AES_Te modulo 4096 # ... and make it doesn't alias with AES_Te modulo 4096
mov $tbl,%r10 mov $sbox,%r10
lea 2048($tbl),%r11 lea 2048($sbox),%r11
mov $key,%r12 mov $key,%r12
and \$0xFFF,%r10 # s = $tbl&0xfff and \$0xFFF,%r10 # s = $sbox&0xfff
and \$0xFFF,%r11 # e = ($tbl+2048)&0xfff and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff
and \$0xFFF,%r12 # p = %rsp&0xfff and \$0xFFF,%r12 # p = %rsp&0xfff
cmp %r11,%r12 # if (p>=e) %rsp -= (p-e); cmp %r11,%r12 # if (p>=e) %rsp -= (p-e);
@ -758,13 +1100,15 @@ AES_cbc_encrypt:
mov %rdx,$_len # save copy of len mov %rdx,$_len # save copy of len
mov %rcx,$_key # save copy of key mov %rcx,$_key # save copy of key
mov %r8,$_ivp # save copy of ivp mov %r8,$_ivp # save copy of ivp
movl \$0,$mark # copy of aes_key->rounds = 0; movl \$0,$mark # copy of aes_key->rounds = 0;
mov %r8,%rbp # rearrange input arguments
mov %rsi,$out
mov %rdi,$inp
mov %rcx,$key mov %rcx,$key
# do we copy key schedule to stack? # do we copy key schedule to stack?
mov $key,%r10 mov $key,%r10
sub $tbl,%r10 sub $sbox,%r10
and \$0xfff,%r10 and \$0xfff,%r10
cmp \$2048,%r10 cmp \$2048,%r10
jb .Lcbc_do_ecopy jb .Lcbc_do_ecopy
@ -772,8 +1116,6 @@ AES_cbc_encrypt:
jb .Lcbc_skip_ecopy jb .Lcbc_skip_ecopy
.align 4 .align 4
.Lcbc_do_ecopy: .Lcbc_do_ecopy:
mov %rsi,%r10 # backup $inp,$out
mov %rdi,%r11
mov $key,%rsi mov $key,%rsi
lea $aes_key,%rdi lea $aes_key,%rdi
lea $aes_key,$key lea $aes_key,$key
@ -781,29 +1123,27 @@ AES_cbc_encrypt:
.long 0x90A548F3 # rep movsq .long 0x90A548F3 # rep movsq
mov (%rsi),%eax # copy aes_key->rounds mov (%rsi),%eax # copy aes_key->rounds
mov %eax,(%rdi) mov %eax,(%rdi)
mov %r10,%rsi # restore $inp,$out
mov %r11,%rdi
.Lcbc_skip_ecopy: .Lcbc_skip_ecopy:
mov $key,$keyp # save key pointer mov $key,$keyp # save key pointer
mov \$16,%ecx mov \$16,%ecx
.align 4 .align 4
.Lcbc_prefetch_te: .Lcbc_prefetch_te:
mov 0($tbl),%r10 mov 0($sbox),%r10
mov 32($tbl),%r11 mov 32($sbox),%r11
mov 64($tbl),%r12 mov 64($sbox),%r12
mov 96($tbl),%r13 mov 96($sbox),%r13
lea 128($tbl),$tbl lea 128($sbox),$sbox
sub \$1,%ecx sub \$1,%ecx
jnz .Lcbc_prefetch_te jnz .Lcbc_prefetch_te
sub \$2048,$tbl sub \$2048,$sbox
test \$-16,%rdx # check upon length test \$-16,%rdx # check upon length
mov %rdx,%r10 mov %rdx,%r10
mov 0(%r8),$s0 # load iv mov 0(%rbp),$s0 # load iv
mov 4(%r8),$s1 mov 4(%rbp),$s1
mov 8(%r8),$s2 mov 8(%rbp),$s2
mov 12(%r8),$s3 mov 12(%rbp),$s3
jz .Lcbc_enc_tail # short input... jz .Lcbc_enc_tail # short input...
.align 4 .align 4
@ -812,10 +1152,12 @@ AES_cbc_encrypt:
xor 4($inp),$s1 xor 4($inp),$s1
xor 8($inp),$s2 xor 8($inp),$s2
xor 12($inp),$s3 xor 12($inp),$s3
mov $inp,$ivec # if ($verticalspin) save inp
mov $keyp,$key # restore key mov $keyp,$key # restore key
call _x86_64_AES_encrypt call _x86_64_AES_encrypt
mov $ivec,$inp # if ($verticalspin) restore inp
mov $s0,0($out) mov $s0,0($out)
mov $s1,4($out) mov $s1,4($out)
mov $s2,8($out) mov $s2,8($out)
@ -830,11 +1172,11 @@ AES_cbc_encrypt:
jnz .Lcbc_enc_loop jnz .Lcbc_enc_loop
test \$15,%r10 test \$15,%r10
jnz .Lcbc_enc_tail jnz .Lcbc_enc_tail
mov $_ivp,%r10 # restore ivp mov $_ivp,%rbp # restore ivp
mov $s0,0(%r10) # save ivec mov $s0,0(%rbp) # save ivec
mov $s1,4(%r10) mov $s1,4(%rbp)
mov $s2,8(%r10) mov $s2,8(%rbp)
mov $s3,12(%r10) mov $s3,12(%rbp)
.align 4 .align 4
.Lcbc_cleanup: .Lcbc_cleanup:
@ -858,36 +1200,34 @@ AES_cbc_encrypt:
.align 4 .align 4
.Lcbc_enc_tail: .Lcbc_enc_tail:
cmp $inp,$out cmp $inp,$out
mov $inp,%r11
mov $out,%r12
je .Lcbc_enc_in_place je .Lcbc_enc_in_place
mov %r10,%rcx mov %r10,%rcx
xchg %rsi,%rdi mov $inp,%rsi
mov $out,%rdi
.long 0xF689A4F3 # rep movsb .long 0xF689A4F3 # rep movsb
.Lcbc_enc_in_place: .Lcbc_enc_in_place:
mov \$16,%rcx # zero tail mov \$16,%rcx # zero tail
sub %r10,%rcx sub %r10,%rcx
xor %rax,%rax xor %rax,%rax
.long 0xF689AAF3 # rep stosb .long 0xF689AAF3 # rep stosb
mov %r12,$inp # this is not a mistake! mov $out,$inp # this is not a mistake!
mov %r12,$out
movq \$16,$_len # len=16 movq \$16,$_len # len=16
jmp .Lcbc_enc_loop # one more spin... jmp .Lcbc_enc_loop # one more spin...
#----------------------------- DECRYPT -----------------------------# #----------------------------- DECRYPT -----------------------------#
.align 16 .align 16
.LDECRYPT: .LDECRYPT:
lea AES_Td-.Lcbc_pic_point($tbl),$tbl lea AES_Td-.Lcbc_pic_point($sbox),$sbox
# allocate aligned stack frame... # allocate aligned stack frame...
lea -64-248(%rsp),$key lea -64-248(%rsp),$key
and \$-64,$key and \$-64,$key
# ... and make it doesn't alias with AES_Td modulo 4096 # ... and make it doesn't alias with AES_Td modulo 4096
mov $tbl,%r10 mov $sbox,%r10
lea 3072($tbl),%r11 lea 3072($sbox),%r11
mov $key,%r12 mov $key,%r12
and \$0xFFF,%r10 # s = $tbl&0xfff and \$0xFFF,%r10 # s = $sbox&0xfff
and \$0xFFF,%r11 # e = ($tbl+3072)&0xfff and \$0xFFF,%r11 # e = ($sbox+3072)&0xfff
and \$0xFFF,%r12 # p = %rsp&0xfff and \$0xFFF,%r12 # p = %rsp&0xfff
cmp %r11,%r12 # if (p>=e) %rsp -= (p-e); cmp %r11,%r12 # if (p>=e) %rsp -= (p-e);
@ -909,13 +1249,15 @@ AES_cbc_encrypt:
mov %rdx,$_len # save copy of len mov %rdx,$_len # save copy of len
mov %rcx,$_key # save copy of key mov %rcx,$_key # save copy of key
mov %r8,$_ivp # save copy of ivp mov %r8,$_ivp # save copy of ivp
movl \$0,$mark # copy of aes_key->rounds = 0; movl \$0,$mark # copy of aes_key->rounds = 0;
mov %r8,%rbp # rearrange input arguments
mov %rsi,$out
mov %rdi,$inp
mov %rcx,$key mov %rcx,$key
# do we copy key schedule to stack? # do we copy key schedule to stack?
mov $key,%r10 mov $key,%r10
sub $tbl,%r10 sub $sbox,%r10
and \$0xfff,%r10 and \$0xfff,%r10
cmp \$3072,%r10 cmp \$3072,%r10
jb .Lcbc_do_dcopy jb .Lcbc_do_dcopy
@ -923,8 +1265,6 @@ AES_cbc_encrypt:
jb .Lcbc_skip_dcopy jb .Lcbc_skip_dcopy
.align 4 .align 4
.Lcbc_do_dcopy: .Lcbc_do_dcopy:
mov %rsi,%r10 # backup $inp,$out
mov %rdi,%r11
mov $key,%rsi mov $key,%rsi
lea $aes_key,%rdi lea $aes_key,%rdi
lea $aes_key,$key lea $aes_key,$key
@ -932,51 +1272,51 @@ AES_cbc_encrypt:
.long 0x90A548F3 # rep movsq .long 0x90A548F3 # rep movsq
mov (%rsi),%eax # copy aes_key->rounds mov (%rsi),%eax # copy aes_key->rounds
mov %eax,(%rdi) mov %eax,(%rdi)
mov %r10,%rsi # restore $inp,$out
mov %r11,%rdi
.Lcbc_skip_dcopy: .Lcbc_skip_dcopy:
mov $key,$keyp # save key pointer mov $key,$keyp # save key pointer
mov \$24,%ecx mov \$24,%ecx
.align 4 .align 4
.Lcbc_prefetch_td: .Lcbc_prefetch_td:
mov 0($tbl),%r10 mov 0($sbox),%r10
mov 32($tbl),%r11 mov 32($sbox),%r11
mov 64($tbl),%r12 mov 64($sbox),%r12
mov 96($tbl),%r13 mov 96($sbox),%r13
lea 128($tbl),$tbl lea 128($sbox),$sbox
sub \$1,%ecx sub \$1,%ecx
jnz .Lcbc_prefetch_td jnz .Lcbc_prefetch_td
sub \$3072,$tbl sub \$3072,$sbox
cmp $inp,$out cmp $inp,$out
je .Lcbc_dec_in_place je .Lcbc_dec_in_place
mov %r8,$ivec mov %rbp,$ivec
.align 4 .align 4
.Lcbc_dec_loop: .Lcbc_dec_loop:
mov 0($inp),$s0 # read input mov 0($inp),$s0 # read input
mov 4($inp),$s1 mov 4($inp),$s1
mov 8($inp),$s2 mov 8($inp),$s2
mov 12($inp),$s3 mov 12($inp),$s3
mov $inp,8+$ivec # if ($verticalspin) save inp
mov $keyp,$key # load key mov $keyp,$key # restore key
call _x86_64_AES_decrypt call _x86_64_AES_decrypt
mov $ivec,%r8 # load ivp mov $ivec,%rbp # load ivp
xor 0(%r8),$s0 # xor iv mov 8+$ivec,$inp # if ($verticalspin) restore inp
xor 4(%r8),$s1 xor 0(%rbp),$s0 # xor iv
xor 8(%r8),$s2 xor 4(%rbp),$s1
xor 12(%r8),$s3 xor 8(%rbp),$s2
mov $inp,%r8 # current input, next iv xor 12(%rbp),$s3
mov $inp,%rbp # current input, next iv
mov $_len,%r10 # load len mov $_len,%r10 # load len
sub \$16,%r10 sub \$16,%r10
jc .Lcbc_dec_partial jc .Lcbc_dec_partial
mov %r10,$_len # update len mov %r10,$_len # update len
mov %r8,$ivec # update ivp mov %rbp,$ivec # update ivp
mov $s0,0($out) # write output mov $s0,0($out) # write output
mov $s1,4($out) mov $s1,4($out)
mov $s2,8($out) mov $s2,8($out)
mov $s3,12($out) mov $s3,12($out)
@ -985,11 +1325,11 @@ AES_cbc_encrypt:
lea 16($out),$out lea 16($out),$out
jnz .Lcbc_dec_loop jnz .Lcbc_dec_loop
.Lcbc_dec_end: .Lcbc_dec_end:
mov $_ivp,%r9 # load user ivp mov $_ivp,%r12 # load user ivp
mov 0(%r8),%r10 # load iv mov 0(%rbp),%r10 # load iv
mov 8(%r8),%r11 mov 8(%rbp),%r11
mov %r10,0(%r9) # copy back to user mov %r10,0(%r12) # copy back to user
mov %r11,8(%r9) mov %r11,8(%r12)
jmp .Lcbc_cleanup jmp .Lcbc_cleanup
.align 4 .align 4
@ -1007,26 +1347,28 @@ AES_cbc_encrypt:
.align 16 .align 16
.Lcbc_dec_in_place: .Lcbc_dec_in_place:
mov 0($inp),$s0 # load input mov 0($inp),$s0 # load input
mov 4($inp),$s1 mov 4($inp),$s1
mov 8($inp),$s2 mov 8($inp),$s2
mov 12($inp),$s3 mov 12($inp),$s3
mov $inp,$ivec # if ($verticalspin) save inp
mov $keyp,$key mov $keyp,$key
call _x86_64_AES_decrypt call _x86_64_AES_decrypt
mov $_ivp,%r8 mov $ivec,$inp # if ($verticalspin) restore inp
xor 0(%r8),$s0 mov $_ivp,%rbp
xor 4(%r8),$s1 xor 0(%rbp),$s0
xor 8(%r8),$s2 xor 4(%rbp),$s1
xor 12(%r8),$s3 xor 8(%rbp),$s2
xor 12(%rbp),$s3
mov 0($inp),%r10 # copy input to iv mov 0($inp),%r10 # copy input to iv
mov 8($inp),%r11 mov 8($inp),%r11
mov %r10,0(%r8) mov %r10,0(%rbp)
mov %r11,8(%r8) mov %r11,8(%rbp)
mov $s0,0($out) # save output [zaps input] mov $s0,0($out) # save output [zaps input]
mov $s1,4($out) mov $s1,4($out)
mov $s2,8($out) mov $s2,8($out)
mov $s3,12($out) mov $s3,12($out)
@ -1044,7 +1386,7 @@ AES_cbc_encrypt:
.Lcbc_dec_in_place_partial: .Lcbc_dec_in_place_partial:
# one can argue if this is actually required # one can argue if this is actually required
lea ($out,%rcx),%rdi lea ($out,%rcx),%rdi
lea (%r8,%rcx),%rsi lea (%rbp,%rcx),%rsi
neg %rcx neg %rcx
.long 0xF689A4F3 # rep movsb # restore tail .long 0xF689A4F3 # rep movsb # restore tail
jmp .Lcbc_cleanup jmp .Lcbc_cleanup
@ -1262,6 +1604,8 @@ ___
&data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363); &data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363);
&data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d); &data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d);
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code; print $code;
close STDOUT; close STDOUT;

View File

@ -167,10 +167,12 @@ my $current_function;
my $self = shift; my $self = shift;
my $sz = shift; my $sz = shift;
# silently convert all EAs to 64-bit, required for elder GNU # Silently convert all EAs to 64-bit. This is required for
# assembler and results in more compact code # elder GNU assembler and results in more compact code,
$self->{index} =~ s/^[er](.?[0-9xp])[d]?$/r\1/; # *but* most importantly AES module depends on this feature!
$self->{base} =~ s/^[er](.?[0-9xp])[d]?$/r\1/; $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
$self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
if (!$masm) { if (!$masm) {
# Solaris /usr/ccs/bin/as can't handle multiplications # Solaris /usr/ccs/bin/as can't handle multiplications
# in $self->{label} # in $self->{label}