~15% better AES x86_64 assembler.
This commit is contained in:
parent
f42e6d24f2
commit
afbe674edb
@ -6,9 +6,9 @@
|
|||||||
# forms are granted according to the OpenSSL license.
|
# forms are granted according to the OpenSSL license.
|
||||||
# ====================================================================
|
# ====================================================================
|
||||||
#
|
#
|
||||||
# Version 1.0.
|
# Version 1.1.
|
||||||
#
|
#
|
||||||
# aes-*-cbc benchmarks are improved by 50% [compared to gcc 3.3.2 on
|
# aes-*-cbc benchmarks are improved by >70% [compared to gcc 3.3.2 on
|
||||||
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
|
# Opteron 240 CPU] plus all the bells-n-whistles from 32-bit version
|
||||||
# [you'll notice a lot of resemblance], such as compressed S-boxes
|
# [you'll notice a lot of resemblance], such as compressed S-boxes
|
||||||
# in little-endian byte order, prefetch of these tables in CBC mode,
|
# in little-endian byte order, prefetch of these tables in CBC mode,
|
||||||
@ -18,14 +18,14 @@
|
|||||||
# Performance in number of cycles per processed byte for 128-bit key:
|
# Performance in number of cycles per processed byte for 128-bit key:
|
||||||
#
|
#
|
||||||
# ECB CBC encrypt
|
# ECB CBC encrypt
|
||||||
# AMD64 15.6 14.6(*)
|
# AMD64 13.7 13.0(*)
|
||||||
# EM64T 23.3(**) 21.4(*)
|
# EM64T 20.2 18.6(*)
|
||||||
#
|
#
|
||||||
# (*) CBC benchmarks are better than ECB thanks to custom ABI used
|
# (*) CBC benchmarks are better than ECB thanks to custom ABI used
|
||||||
# by the private block encryption function.
|
# by the private block encryption function.
|
||||||
# (**) This module exhibits virtually same ECB performance as 32-bit
|
|
||||||
# counterpart on [current] Intel CPU.
|
|
||||||
|
|
||||||
|
$verticalspin=1; # unlike 32-bit version $verticalspin performs
|
||||||
|
# ~15% better on both AMD and Intel cores
|
||||||
$output=shift;
|
$output=shift;
|
||||||
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
|
open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
|
||||||
|
|
||||||
@ -35,20 +35,22 @@ $s0="%eax";
|
|||||||
$s1="%ebx";
|
$s1="%ebx";
|
||||||
$s2="%ecx";
|
$s2="%ecx";
|
||||||
$s3="%edx";
|
$s3="%edx";
|
||||||
$inp="%rdi";
|
$acc0="%esi";
|
||||||
$out="%rsi";
|
$acc1="%edi";
|
||||||
$acc0="%ebp";
|
$acc2="%ebp";
|
||||||
$acc1="%r8d";
|
$inp="%r8";
|
||||||
$acc2="%r9d";
|
$out="%r9";
|
||||||
$t0="%r10d";
|
$t0="%r10d";
|
||||||
$t1="%r11d";
|
$t1="%r11d";
|
||||||
$t2="%r12d";
|
$t2="%r12d";
|
||||||
$cnt="%r13d";
|
$rnds="%r13d";
|
||||||
$tbl="%r14";
|
$sbox="%r14";
|
||||||
$key="%r15";
|
$key="%r15";
|
||||||
|
|
||||||
sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
|
# hi(REG): return the high-byte alias of a legacy register name, i.e.
# %eax/%rax .. %edx/%rdx become %ah .. %dh.  Names that do not match the
# pattern (e.g. %esi, %r8d) are returned unchanged.
# The empty prototype is not enforced at the call sites visible in this
# file, which all use the &hi(...) form.
sub hi() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1h/; $r; }
|
||||||
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/; $r; }
|
# lo(REG): return the low-byte alias of a 32/64-bit register name.
# Three substitutions are tried in turn, each covering one register family:
#   %eax/%rax .. %edx/%rdx  ->  %al .. %dl
#   %esi/%rsi, %edi/%rdi    ->  %sil, %dil
#   %rN or %rNd (numbered)  ->  %rNb
# A name matching none of them is returned unchanged.
# The empty prototype is not enforced at the call sites visible in this
# file, which all use the &lo(...) form.
sub lo() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/;
|
||||||
|
$r =~ s/%[er]([sd]i)/%\1l/;
|
||||||
|
$r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
|
||||||
sub _data_word()
|
sub _data_word()
|
||||||
{ my $i;
|
{ my $i;
|
||||||
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
|
while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
|
||||||
@ -61,6 +63,169 @@ sub data_word()
|
|||||||
$code.=sprintf"0x%08x\n",$last;
|
$code.=sprintf"0x%08x\n",$last;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# encvert(): append to $code the assembly for one table-driven round over
# all four state words $s0..$s3 at once.  It is the "vertical" alternative
# to four encstep() calls and is selected when $verticalspin is set.
#   - Each result word $t0..$t3 is assembled from four 32-bit loads out of
#     the 8-byte-stride table at $sbox (byte offsets 0, 3, 2 and 1), each
#     indexed by one byte taken from a different state word.
#   - $t3 is %r8d, the low half of $inp (%r8), so the emitted code
#     overwrites $inp.
#   - $key is advanced by 16, then the four words at 0..12($key) are loaded
#     into $s0..$s3 and XORed with $t0..$t3, leaving the new state in place.
# $qs0..$qs3 are single-quoted on purpose: the heredoc then emits the text
# &lo("$s0") etc. verbatim between backticks, deferring the register-name
# substitution.  Presumably a later pass of the script evaluates the
# backtick expressions -- that pass is not visible here, confirm.
# The instruction order is deliberate scheduling (see the pipeline note at
# the top of the heredoc); state words are shifted/overwritten only after
# their last byte extraction, so do not reorder casually.
sub encvert()
|
||||||
|
{ my $t3="%r8d"; # zaps $inp!
|
||||||
|
my $qs0='"$s0"';
|
||||||
|
my $qs1='"$s1"';
|
||||||
|
my $qs2='"$s2"';
|
||||||
|
my $qs3='"$s3"';
|
||||||
|
|
||||||
|
$code.=<<___;
|
||||||
|
# favor 3-way issue Opteron pipeline...
|
||||||
|
movzb `&lo($qs0)`,$acc0
|
||||||
|
movzb `&lo($qs1)`,$acc1
|
||||||
|
movzb `&lo($qs2)`,$acc2
|
||||||
|
mov 0($sbox,$acc0,8),$t0
|
||||||
|
mov 0($sbox,$acc1,8),$t1
|
||||||
|
mov 0($sbox,$acc2,8),$t2
|
||||||
|
|
||||||
|
movzb `&hi($qs1)`,$acc0
|
||||||
|
movzb `&hi($qs2)`,$acc1
|
||||||
|
movzb `&lo($qs3)`,$acc2
|
||||||
|
xor 3($sbox,$acc0,8),$t0
|
||||||
|
xor 3($sbox,$acc1,8),$t1
|
||||||
|
mov 0($sbox,$acc2,8),$t3
|
||||||
|
|
||||||
|
movzb `&hi($qs3)`,$acc0
|
||||||
|
shr \$16,$s2
|
||||||
|
movzb `&hi($qs0)`,$acc2
|
||||||
|
xor 3($sbox,$acc0,8),$t2
|
||||||
|
shr \$16,$s3
|
||||||
|
xor 3($sbox,$acc2,8),$t3
|
||||||
|
|
||||||
|
shr \$16,$s1
|
||||||
|
lea 16($key),$key
|
||||||
|
shr \$16,$s0
|
||||||
|
|
||||||
|
movzb `&lo($qs2)`,$acc0
|
||||||
|
movzb `&lo($qs3)`,$acc1
|
||||||
|
movzb `&lo($qs0)`,$acc2
|
||||||
|
xor 2($sbox,$acc0,8),$t0
|
||||||
|
xor 2($sbox,$acc1,8),$t1
|
||||||
|
xor 2($sbox,$acc2,8),$t2
|
||||||
|
|
||||||
|
movzb `&hi($qs3)`,$acc0
|
||||||
|
movzb `&hi($qs0)`,$acc1
|
||||||
|
movzb `&lo($qs1)`,$acc2
|
||||||
|
xor 1($sbox,$acc0,8),$t0
|
||||||
|
xor 1($sbox,$acc1,8),$t1
|
||||||
|
xor 2($sbox,$acc2,8),$t3
|
||||||
|
|
||||||
|
mov 12($key),$s3
|
||||||
|
movzb `&hi($qs1)`,$acc1
|
||||||
|
movzb `&hi($qs2)`,$acc2
|
||||||
|
mov 0($key),$s0
|
||||||
|
xor 1($sbox,$acc1,8),$t2
|
||||||
|
xor 1($sbox,$acc2,8),$t3
|
||||||
|
|
||||||
|
mov 4($key),$s1
|
||||||
|
mov 8($key),$s2
|
||||||
|
xor $t0,$s0
|
||||||
|
xor $t1,$s1
|
||||||
|
xor $t2,$s2
|
||||||
|
xor $t3,$s3
|
||||||
|
___
|
||||||
|
}
|
||||||
|
|
||||||
|
# enclastvert(): append to $code the assembly for the final round over all
# four state words $s0..$s3 at once ("vertical" alternative to four
# enclast() calls, selected when $verticalspin is set).
#   - Each result word $t0..$t3 is pieced together from four table loads,
#     every one masked down to a single byte lane: offset 2 of the
#     8-byte-stride $sbox entry supplies lanes 0x000000ff and 0xff000000,
#     offset 0 supplies lanes 0x0000ff00 and 0x00ff0000.
#   - The trailing "#$tN" notes inside the heredoc name the result word the
#     loaded value is destined for.
#   - $t3 is %r8d, the low half of $inp (%r8), so the emitted code
#     overwrites $inp.
#   - $key is NOT advanced here; the closing key words are read from
#     16+0 .. 16+12($key) into $s0..$s3 and XORed with $t0..$t3.
# $qs0..$qs3 are single-quoted on purpose so that the heredoc emits
# &lo("$s0") etc. verbatim between backticks; presumably a later pass of
# the script evaluates those expressions -- not visible here, confirm.
# State words are shifted/overwritten only after their last byte
# extraction, so the statement order must be preserved.
sub enclastvert()
|
||||||
|
{ my $t3="%r8d"; # zaps $inp!
|
||||||
|
my $qs0='"$s0"';
|
||||||
|
my $qs1='"$s1"';
|
||||||
|
my $qs2='"$s2"';
|
||||||
|
my $qs3='"$s3"';
|
||||||
|
|
||||||
|
$code.=<<___;
|
||||||
|
movzb `&lo($qs0)`,$acc0
|
||||||
|
movzb `&lo($qs1)`,$acc1
|
||||||
|
movzb `&lo($qs2)`,$acc2
|
||||||
|
mov 2($sbox,$acc0,8),$t0
|
||||||
|
mov 2($sbox,$acc1,8),$t1
|
||||||
|
mov 2($sbox,$acc2,8),$t2
|
||||||
|
|
||||||
|
and \$0x000000ff,$t0
|
||||||
|
and \$0x000000ff,$t1
|
||||||
|
and \$0x000000ff,$t2
|
||||||
|
|
||||||
|
movzb `&lo($qs3)`,$acc0
|
||||||
|
movzb `&hi($qs1)`,$acc1
|
||||||
|
movzb `&hi($qs2)`,$acc2
|
||||||
|
mov 2($sbox,$acc0,8),$t3
|
||||||
|
mov 0($sbox,$acc1,8),$acc1 #$t0
|
||||||
|
mov 0($sbox,$acc2,8),$acc2 #$t1
|
||||||
|
|
||||||
|
and \$0x000000ff,$t3
|
||||||
|
and \$0x0000ff00,$acc1
|
||||||
|
and \$0x0000ff00,$acc2
|
||||||
|
|
||||||
|
xor $acc1,$t0
|
||||||
|
xor $acc2,$t1
|
||||||
|
shr \$16,$s2
|
||||||
|
|
||||||
|
movzb `&hi($qs3)`,$acc0
|
||||||
|
movzb `&hi($qs0)`,$acc1
|
||||||
|
shr \$16,$s3
|
||||||
|
mov 0($sbox,$acc0,8),$acc0 #$t2
|
||||||
|
mov 0($sbox,$acc1,8),$acc1 #$t3
|
||||||
|
|
||||||
|
and \$0x0000ff00,$acc0
|
||||||
|
and \$0x0000ff00,$acc1
|
||||||
|
shr \$16,$s1
|
||||||
|
xor $acc0,$t2
|
||||||
|
xor $acc1,$t3
|
||||||
|
shr \$16,$s0
|
||||||
|
|
||||||
|
movzb `&lo($qs2)`,$acc0
|
||||||
|
movzb `&lo($qs3)`,$acc1
|
||||||
|
movzb `&lo($qs0)`,$acc2
|
||||||
|
mov 0($sbox,$acc0,8),$acc0 #$t0
|
||||||
|
mov 0($sbox,$acc1,8),$acc1 #$t1
|
||||||
|
mov 0($sbox,$acc2,8),$acc2 #$t2
|
||||||
|
|
||||||
|
and \$0x00ff0000,$acc0
|
||||||
|
and \$0x00ff0000,$acc1
|
||||||
|
and \$0x00ff0000,$acc2
|
||||||
|
|
||||||
|
xor $acc0,$t0
|
||||||
|
xor $acc1,$t1
|
||||||
|
xor $acc2,$t2
|
||||||
|
|
||||||
|
movzb `&lo($qs1)`,$acc0
|
||||||
|
movzb `&hi($qs3)`,$acc1
|
||||||
|
movzb `&hi($qs0)`,$acc2
|
||||||
|
mov 0($sbox,$acc0,8),$acc0 #$t3
|
||||||
|
mov 2($sbox,$acc1,8),$acc1 #$t0
|
||||||
|
mov 2($sbox,$acc2,8),$acc2 #$t1
|
||||||
|
|
||||||
|
and \$0x00ff0000,$acc0
|
||||||
|
and \$0xff000000,$acc1
|
||||||
|
and \$0xff000000,$acc2
|
||||||
|
|
||||||
|
xor $acc0,$t3
|
||||||
|
xor $acc1,$t0
|
||||||
|
xor $acc2,$t1
|
||||||
|
|
||||||
|
movzb `&hi($qs1)`,$acc0
|
||||||
|
movzb `&hi($qs2)`,$acc1
|
||||||
|
mov 16+12($key),$s3
|
||||||
|
mov 2($sbox,$acc0,8),$acc0 #$t2
|
||||||
|
mov 2($sbox,$acc1,8),$acc1 #$t3
|
||||||
|
mov 16+0($key),$s0
|
||||||
|
|
||||||
|
and \$0xff000000,$acc0
|
||||||
|
and \$0xff000000,$acc1
|
||||||
|
|
||||||
|
xor $acc0,$t2
|
||||||
|
xor $acc1,$t3
|
||||||
|
|
||||||
|
mov 16+4($key),$s1
|
||||||
|
mov 16+8($key),$s2
|
||||||
|
xor $t0,$s0
|
||||||
|
xor $t1,$s1
|
||||||
|
xor $t2,$s2
|
||||||
|
xor $t3,$s3
|
||||||
|
___
|
||||||
|
}
|
||||||
|
|
||||||
sub encstep()
|
sub encstep()
|
||||||
{ my ($i,@s) = @_;
|
{ my ($i,@s) = @_;
|
||||||
my $tmp0=$acc0;
|
my $tmp0=$acc0;
|
||||||
@ -68,24 +233,28 @@ sub encstep()
|
|||||||
my $tmp2=$acc2;
|
my $tmp2=$acc2;
|
||||||
my $out=($t0,$t1,$t2,$s[0])[$i];
|
my $out=($t0,$t1,$t2,$s[0])[$i];
|
||||||
|
|
||||||
$code.=" mov $s[0],$out\n" if ($i!=3);
|
if ($i==3) {
|
||||||
$tmp1=$s[2] if ($i==3);
|
$tmp0=$s[1];
|
||||||
|
$tmp1=$s[2];
|
||||||
|
$tmp2=$s[3];
|
||||||
|
}
|
||||||
|
$code.=" movzb ".&lo($s[0]).",$out\n";
|
||||||
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
|
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
|
||||||
$code.=" and \$0xFF,$out\n";
|
$code.=" lea 16($key),$key\n" if ($i==0);
|
||||||
|
|
||||||
$code.=" mov 0($tbl,$out,8),$out\n";
|
|
||||||
$code.=" shr \$16,$tmp1\n";
|
|
||||||
$tmp2=$s[3] if ($i==3);
|
|
||||||
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
|
|
||||||
|
|
||||||
$tmp0=$s[1] if ($i==3);
|
|
||||||
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
|
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
|
||||||
$code.=" and \$0xFF,$tmp1\n";
|
$code.=" mov 0($sbox,$out,8),$out\n";
|
||||||
$code.=" shr \$24,$tmp2\n";
|
|
||||||
|
|
||||||
$code.=" xor 3($tbl,$tmp0,8),$out\n";
|
$code.=" shr \$16,$tmp1\n";
|
||||||
$code.=" xor 2($tbl,$tmp1,8),$out\n";
|
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
|
||||||
$code.=" xor 1($tbl,$tmp2,8),$out\n";
|
$code.=" xor 3($sbox,$tmp0,8),$out\n";
|
||||||
|
|
||||||
|
$code.=" movzb ".&lo($tmp1).",$tmp1\n";
|
||||||
|
$code.=" shr \$24,$tmp2\n";
|
||||||
|
$code.=" xor 4*$i($key),$out\n";
|
||||||
|
|
||||||
|
$code.=" xor 2($sbox,$tmp1,8),$out\n";
|
||||||
|
$code.=" xor 1($sbox,$tmp2,8),$out\n";
|
||||||
|
|
||||||
$code.=" mov $t0,$s[1]\n" if ($i==3);
|
$code.=" mov $t0,$s[1]\n" if ($i==3);
|
||||||
$code.=" mov $t1,$s[2]\n" if ($i==3);
|
$code.=" mov $t1,$s[2]\n" if ($i==3);
|
||||||
@ -100,25 +269,26 @@ sub enclast()
|
|||||||
my $tmp2=$acc2;
|
my $tmp2=$acc2;
|
||||||
my $out=($t0,$t1,$t2,$s[0])[$i];
|
my $out=($t0,$t1,$t2,$s[0])[$i];
|
||||||
|
|
||||||
$code.=" mov $s[0],$out\n" if ($i!=3);
|
if ($i==3) {
|
||||||
$tmp1=$s[2] if ($i==3);
|
$tmp0=$s[1];
|
||||||
|
$tmp1=$s[2];
|
||||||
|
$tmp2=$s[3];
|
||||||
|
}
|
||||||
|
$code.=" movzb ".&lo($s[0]).",$out\n";
|
||||||
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
|
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
|
||||||
$code.=" and \$0xFF,$out\n";
|
|
||||||
|
|
||||||
$code.=" mov 2($tbl,$out,8),$out\n";
|
$code.=" mov 2($sbox,$out,8),$out\n";
|
||||||
$code.=" shr \$16,$tmp1\n";
|
$code.=" shr \$16,$tmp1\n";
|
||||||
$tmp2=$s[3] if ($i==3);
|
|
||||||
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
|
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
|
||||||
|
|
||||||
$code.=" and \$0x000000ff,$out\n";
|
$code.=" and \$0x000000ff,$out\n";
|
||||||
$tmp0=$s[1] if ($i==3);
|
|
||||||
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
|
$code.=" movzb ".&hi($s[1]).",$tmp0\n";
|
||||||
$code.=" and \$0xFF,$tmp1\n";
|
$code.=" movzb ".&lo($tmp1).",$tmp1\n";
|
||||||
$code.=" shr \$24,$tmp2\n";
|
$code.=" shr \$24,$tmp2\n";
|
||||||
|
|
||||||
$code.=" mov 0($tbl,$tmp0,8),$tmp0\n";
|
$code.=" mov 0($sbox,$tmp0,8),$tmp0\n";
|
||||||
$code.=" mov 0($tbl,$tmp1,8),$tmp1\n";
|
$code.=" mov 0($sbox,$tmp1,8),$tmp1\n";
|
||||||
$code.=" mov 2($tbl,$tmp2,8),$tmp2\n";
|
$code.=" mov 2($sbox,$tmp2,8),$tmp2\n";
|
||||||
|
|
||||||
$code.=" and \$0x0000ff00,$tmp0\n";
|
$code.=" and \$0x0000ff00,$tmp0\n";
|
||||||
$code.=" and \$0x00ff0000,$tmp1\n";
|
$code.=" and \$0x00ff0000,$tmp1\n";
|
||||||
@ -142,36 +312,35 @@ _x86_64_AES_encrypt:
|
|||||||
xor 8($key),$s2
|
xor 8($key),$s2
|
||||||
xor 12($key),$s3
|
xor 12($key),$s3
|
||||||
|
|
||||||
mov 240($key),$cnt # load key->rounds
|
mov 240($key),$rnds # load key->rounds
|
||||||
sub \$1,$cnt
|
sub \$1,$rnds
|
||||||
.align 4
|
jmp .Lenc_loop
|
||||||
|
.align 16
|
||||||
.Lenc_loop:
|
.Lenc_loop:
|
||||||
___
|
___
|
||||||
&encstep(0,$s0,$s1,$s2,$s3);
|
if ($verticalspin) { &encvert(); }
|
||||||
&encstep(1,$s1,$s2,$s3,$s0);
|
else { &encstep(0,$s0,$s1,$s2,$s3);
|
||||||
&encstep(2,$s2,$s3,$s0,$s1);
|
&encstep(1,$s1,$s2,$s3,$s0);
|
||||||
&encstep(3,$s3,$s0,$s1,$s2);
|
&encstep(2,$s2,$s3,$s0,$s1);
|
||||||
|
&encstep(3,$s3,$s0,$s1,$s2);
|
||||||
|
}
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
lea 16($key),$key
|
sub \$1,$rnds
|
||||||
xor 0($key),$s0 # xor with key
|
|
||||||
xor 4($key),$s1
|
|
||||||
xor 8($key),$s2
|
|
||||||
xor 12($key),$s3
|
|
||||||
|
|
||||||
sub \$1,$cnt
|
|
||||||
jnz .Lenc_loop
|
jnz .Lenc_loop
|
||||||
___
|
___
|
||||||
&enclast(0,$s0,$s1,$s2,$s3);
|
if ($verticalspin) { &enclastvert(); }
|
||||||
&enclast(1,$s1,$s2,$s3,$s0);
|
else { &enclast(0,$s0,$s1,$s2,$s3);
|
||||||
&enclast(2,$s2,$s3,$s0,$s1);
|
&enclast(1,$s1,$s2,$s3,$s0);
|
||||||
&enclast(3,$s3,$s0,$s1,$s2);
|
&enclast(2,$s2,$s3,$s0,$s1);
|
||||||
|
&enclast(3,$s3,$s0,$s1,$s2);
|
||||||
|
$code.=<<___;
|
||||||
|
xor 16+0($key),$s0 # xor with key
|
||||||
|
xor 16+4($key),$s1
|
||||||
|
xor 16+8($key),$s2
|
||||||
|
xor 16+12($key),$s3
|
||||||
|
___
|
||||||
|
}
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
lea 16($key),$key
|
|
||||||
xor 0($key),$s0 # xor with key
|
|
||||||
xor 4($key),$s1
|
|
||||||
xor 8($key),$s2
|
|
||||||
xor 12($key),$s3
|
|
||||||
|
|
||||||
.byte 0xf3,0xc3 # rep ret
|
.byte 0xf3,0xc3 # rep ret
|
||||||
.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
|
.size _x86_64_AES_encrypt,.-_x86_64_AES_encrypt
|
||||||
___
|
___
|
||||||
@ -190,9 +359,11 @@ AES_encrypt:
|
|||||||
push %r15
|
push %r15
|
||||||
|
|
||||||
mov %rdx,$key
|
mov %rdx,$key
|
||||||
|
mov %rdi,$inp
|
||||||
|
mov %rsi,$out
|
||||||
|
|
||||||
.picmeup $tbl
|
.picmeup $sbox
|
||||||
lea AES_Te-.($tbl),$tbl
|
lea AES_Te-.($sbox),$sbox
|
||||||
|
|
||||||
mov 0($inp),$s0
|
mov 0($inp),$s0
|
||||||
mov 4($inp),$s1
|
mov 4($inp),$s1
|
||||||
@ -218,6 +389,169 @@ ___
|
|||||||
|
|
||||||
#------------------------------------------------------------------#
|
#------------------------------------------------------------------#
|
||||||
|
|
||||||
|
# decvert(): append to $code the assembly for one table-driven decryption
# round over all four state words $s0..$s3 at once.  It is the "vertical"
# alternative to four decstep() calls and is selected when $verticalspin
# is set.
#   - Each result word $t0..$t3 is assembled from four 32-bit loads out of
#     the 8-byte-stride table at $sbox (byte offsets 0, 3, 2 and 1).  The
#     source word for each byte follows the decstep() argument order
#     (s0,s3,s2,s1 / s1,s0,s3,s2 / ...), i.e. the opposite rotation to the
#     encrypt variant.
#   - $t3 is %r8d, the low half of $inp (%r8), so the emitted code
#     overwrites $inp.
#   - $key is advanced by 16, then the four words at 0..12($key) are loaded
#     into $s0..$s3 and XORed with $t0..$t3, leaving the new state in place.
# $qs0..$qs3 are single-quoted on purpose so that the heredoc emits
# &lo("$s0") etc. verbatim between backticks; presumably a later pass of
# the script evaluates those expressions -- not visible here, confirm.
# State words are shifted/overwritten only after their last byte
# extraction, so the statement order must be preserved.
sub decvert()
|
||||||
|
{ my $t3="%r8d"; # zaps $inp!
|
||||||
|
my $qs0='"$s0"';
|
||||||
|
my $qs1='"$s1"';
|
||||||
|
my $qs2='"$s2"';
|
||||||
|
my $qs3='"$s3"';
|
||||||
|
|
||||||
|
$code.=<<___;
|
||||||
|
# favor 3-way issue Opteron pipeline...
|
||||||
|
movzb `&lo($qs0)`,$acc0
|
||||||
|
movzb `&lo($qs1)`,$acc1
|
||||||
|
movzb `&lo($qs2)`,$acc2
|
||||||
|
mov 0($sbox,$acc0,8),$t0
|
||||||
|
mov 0($sbox,$acc1,8),$t1
|
||||||
|
mov 0($sbox,$acc2,8),$t2
|
||||||
|
|
||||||
|
movzb `&hi($qs3)`,$acc0
|
||||||
|
movzb `&hi($qs0)`,$acc1
|
||||||
|
movzb `&lo($qs3)`,$acc2
|
||||||
|
xor 3($sbox,$acc0,8),$t0
|
||||||
|
xor 3($sbox,$acc1,8),$t1
|
||||||
|
mov 0($sbox,$acc2,8),$t3
|
||||||
|
|
||||||
|
movzb `&hi($qs1)`,$acc0
|
||||||
|
shr \$16,$s0
|
||||||
|
movzb `&hi($qs2)`,$acc2
|
||||||
|
xor 3($sbox,$acc0,8),$t2
|
||||||
|
shr \$16,$s3
|
||||||
|
xor 3($sbox,$acc2,8),$t3
|
||||||
|
|
||||||
|
shr \$16,$s1
|
||||||
|
lea 16($key),$key
|
||||||
|
shr \$16,$s2
|
||||||
|
|
||||||
|
movzb `&lo($qs2)`,$acc0
|
||||||
|
movzb `&lo($qs3)`,$acc1
|
||||||
|
movzb `&lo($qs0)`,$acc2
|
||||||
|
xor 2($sbox,$acc0,8),$t0
|
||||||
|
xor 2($sbox,$acc1,8),$t1
|
||||||
|
xor 2($sbox,$acc2,8),$t2
|
||||||
|
|
||||||
|
movzb `&hi($qs1)`,$acc0
|
||||||
|
movzb `&hi($qs2)`,$acc1
|
||||||
|
movzb `&lo($qs1)`,$acc2
|
||||||
|
xor 1($sbox,$acc0,8),$t0
|
||||||
|
xor 1($sbox,$acc1,8),$t1
|
||||||
|
xor 2($sbox,$acc2,8),$t3
|
||||||
|
|
||||||
|
movzb `&hi($qs3)`,$acc0
|
||||||
|
mov 12($key),$s3
|
||||||
|
movzb `&hi($qs0)`,$acc2
|
||||||
|
xor 1($sbox,$acc0,8),$t2
|
||||||
|
mov 0($key),$s0
|
||||||
|
xor 1($sbox,$acc2,8),$t3
|
||||||
|
|
||||||
|
xor $t0,$s0
|
||||||
|
mov 4($key),$s1
|
||||||
|
mov 8($key),$s2
|
||||||
|
xor $t2,$s2
|
||||||
|
xor $t1,$s1
|
||||||
|
xor $t3,$s3
|
||||||
|
___
|
||||||
|
}
|
||||||
|
|
||||||
|
# declastvert(): append to $code the assembly for the final decryption
# round over all four state words $s0..$s3 at once ("vertical" alternative
# to four declast() calls, selected when $verticalspin is set).
#   - Every lookup reads a 32-bit word from the 4-byte-stride table that
#     starts 2048 bytes past $sbox, and is masked down to a single byte
#     lane (0x000000ff, 0x0000ff00, 0x00ff0000 or 0xff000000) before being
#     merged into $t0..$t3.
#   - The trailing "#$tN" notes inside the heredoc name the result word the
#     loaded value is destined for.
#   - $t3 is %r8d, the low half of $inp (%r8), so the emitted code
#     overwrites $inp.
#   - $key is NOT advanced here; the closing key words are read from
#     16+0 .. 16+12($key) into $s0..$s3 and XORed with $t0..$t3.
# $qs0..$qs3 are single-quoted on purpose so that the heredoc emits
# &lo("$s0") etc. verbatim between backticks; presumably a later pass of
# the script evaluates those expressions -- not visible here, confirm.
# State words are shifted/overwritten only after their last byte
# extraction, so the statement order must be preserved.
sub declastvert()
|
||||||
|
{ my $t3="%r8d"; # zaps $inp!
|
||||||
|
my $qs0='"$s0"';
|
||||||
|
my $qs1='"$s1"';
|
||||||
|
my $qs2='"$s2"';
|
||||||
|
my $qs3='"$s3"';
|
||||||
|
|
||||||
|
$code.=<<___;
|
||||||
|
movzb `&lo($qs0)`,$acc0
|
||||||
|
movzb `&lo($qs1)`,$acc1
|
||||||
|
movzb `&lo($qs2)`,$acc2
|
||||||
|
mov 2048($sbox,$acc0,4),$t0
|
||||||
|
mov 2048($sbox,$acc1,4),$t1
|
||||||
|
mov 2048($sbox,$acc2,4),$t2
|
||||||
|
|
||||||
|
and \$0x000000ff,$t0
|
||||||
|
and \$0x000000ff,$t1
|
||||||
|
and \$0x000000ff,$t2
|
||||||
|
|
||||||
|
movzb `&lo($qs3)`,$acc0
|
||||||
|
movzb `&hi($qs3)`,$acc1
|
||||||
|
movzb `&hi($qs0)`,$acc2
|
||||||
|
mov 2048($sbox,$acc0,4),$t3
|
||||||
|
mov 2048($sbox,$acc1,4),$acc1 #$t0
|
||||||
|
mov 2048($sbox,$acc2,4),$acc2 #$t1
|
||||||
|
|
||||||
|
and \$0x000000ff,$t3
|
||||||
|
and \$0x0000ff00,$acc1
|
||||||
|
and \$0x0000ff00,$acc2
|
||||||
|
|
||||||
|
xor $acc1,$t0
|
||||||
|
xor $acc2,$t1
|
||||||
|
shr \$16,$s3
|
||||||
|
|
||||||
|
movzb `&hi($qs1)`,$acc0
|
||||||
|
movzb `&hi($qs2)`,$acc1
|
||||||
|
shr \$16,$s0
|
||||||
|
mov 2048($sbox,$acc0,4),$acc0 #$t2
|
||||||
|
mov 2048($sbox,$acc1,4),$acc1 #$t3
|
||||||
|
|
||||||
|
and \$0x0000ff00,$acc0
|
||||||
|
and \$0x0000ff00,$acc1
|
||||||
|
shr \$16,$s1
|
||||||
|
xor $acc0,$t2
|
||||||
|
xor $acc1,$t3
|
||||||
|
shr \$16,$s2
|
||||||
|
|
||||||
|
movzb `&lo($qs2)`,$acc0
|
||||||
|
movzb `&lo($qs3)`,$acc1
|
||||||
|
movzb `&lo($qs0)`,$acc2
|
||||||
|
mov 2048($sbox,$acc0,4),$acc0 #$t0
|
||||||
|
mov 2048($sbox,$acc1,4),$acc1 #$t1
|
||||||
|
mov 2048($sbox,$acc2,4),$acc2 #$t2
|
||||||
|
|
||||||
|
and \$0x00ff0000,$acc0
|
||||||
|
and \$0x00ff0000,$acc1
|
||||||
|
and \$0x00ff0000,$acc2
|
||||||
|
|
||||||
|
xor $acc0,$t0
|
||||||
|
xor $acc1,$t1
|
||||||
|
xor $acc2,$t2
|
||||||
|
|
||||||
|
movzb `&lo($qs1)`,$acc0
|
||||||
|
movzb `&hi($qs1)`,$acc1
|
||||||
|
movzb `&hi($qs2)`,$acc2
|
||||||
|
mov 2048($sbox,$acc0,4),$acc0 #$t3
|
||||||
|
mov 2048($sbox,$acc1,4),$acc1 #$t0
|
||||||
|
mov 2048($sbox,$acc2,4),$acc2 #$t1
|
||||||
|
|
||||||
|
and \$0x00ff0000,$acc0
|
||||||
|
and \$0xff000000,$acc1
|
||||||
|
and \$0xff000000,$acc2
|
||||||
|
|
||||||
|
xor $acc0,$t3
|
||||||
|
xor $acc1,$t0
|
||||||
|
xor $acc2,$t1
|
||||||
|
|
||||||
|
movzb `&hi($qs3)`,$acc0
|
||||||
|
movzb `&hi($qs0)`,$acc1
|
||||||
|
mov 16+12($key),$s3
|
||||||
|
mov 2048($sbox,$acc0,4),$acc0 #$t2
|
||||||
|
mov 2048($sbox,$acc1,4),$acc1 #$t3
|
||||||
|
mov 16+0($key),$s0
|
||||||
|
|
||||||
|
and \$0xff000000,$acc0
|
||||||
|
and \$0xff000000,$acc1
|
||||||
|
|
||||||
|
xor $acc0,$t2
|
||||||
|
xor $acc1,$t3
|
||||||
|
|
||||||
|
mov 16+4($key),$s1
|
||||||
|
mov 16+8($key),$s2
|
||||||
|
xor $t0,$s0
|
||||||
|
xor $t1,$s1
|
||||||
|
xor $t2,$s2
|
||||||
|
xor $t3,$s3
|
||||||
|
___
|
||||||
|
}
|
||||||
|
|
||||||
sub decstep()
|
sub decstep()
|
||||||
{ my ($i,@s) = @_;
|
{ my ($i,@s) = @_;
|
||||||
my $tmp0=$acc0;
|
my $tmp0=$acc0;
|
||||||
@ -230,7 +564,7 @@ sub decstep()
|
|||||||
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
|
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
|
||||||
$code.=" and \$0xFF,$out\n";
|
$code.=" and \$0xFF,$out\n";
|
||||||
|
|
||||||
$code.=" mov 0($tbl,$out,8),$out\n";
|
$code.=" mov 0($sbox,$out,8),$out\n";
|
||||||
$code.=" shr \$16,$tmp1\n";
|
$code.=" shr \$16,$tmp1\n";
|
||||||
$tmp2=$s[3] if ($i==3);
|
$tmp2=$s[3] if ($i==3);
|
||||||
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
|
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
|
||||||
@ -240,9 +574,9 @@ sub decstep()
|
|||||||
$code.=" and \$0xFF,$tmp1\n";
|
$code.=" and \$0xFF,$tmp1\n";
|
||||||
$code.=" shr \$24,$tmp2\n";
|
$code.=" shr \$24,$tmp2\n";
|
||||||
|
|
||||||
$code.=" xor 3($tbl,$tmp0,8),$out\n";
|
$code.=" xor 3($sbox,$tmp0,8),$out\n";
|
||||||
$code.=" xor 2($tbl,$tmp1,8),$out\n";
|
$code.=" xor 2($sbox,$tmp1,8),$out\n";
|
||||||
$code.=" xor 1($tbl,$tmp2,8),$out\n";
|
$code.=" xor 1($sbox,$tmp2,8),$out\n";
|
||||||
|
|
||||||
$code.=" mov $t2,$s[1]\n" if ($i==3);
|
$code.=" mov $t2,$s[1]\n" if ($i==3);
|
||||||
$code.=" mov $t1,$s[2]\n" if ($i==3);
|
$code.=" mov $t1,$s[2]\n" if ($i==3);
|
||||||
@ -262,7 +596,7 @@ sub declast()
|
|||||||
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
|
$code.=" mov $s[2],$tmp1\n" if ($i!=3);
|
||||||
$code.=" and \$0xFF,$out\n";
|
$code.=" and \$0xFF,$out\n";
|
||||||
|
|
||||||
$code.=" mov 2048($tbl,$out,4),$out\n";
|
$code.=" mov 2048($sbox,$out,4),$out\n";
|
||||||
$code.=" shr \$16,$tmp1\n";
|
$code.=" shr \$16,$tmp1\n";
|
||||||
$tmp2=$s[3] if ($i==3);
|
$tmp2=$s[3] if ($i==3);
|
||||||
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
|
$code.=" mov $s[3],$tmp2\n" if ($i!=3);
|
||||||
@ -273,9 +607,9 @@ sub declast()
|
|||||||
$code.=" and \$0xFF,$tmp1\n";
|
$code.=" and \$0xFF,$tmp1\n";
|
||||||
$code.=" shr \$24,$tmp2\n";
|
$code.=" shr \$24,$tmp2\n";
|
||||||
|
|
||||||
$code.=" mov 2048($tbl,$tmp0,4),$tmp0\n";
|
$code.=" mov 2048($sbox,$tmp0,4),$tmp0\n";
|
||||||
$code.=" mov 2048($tbl,$tmp1,4),$tmp1\n";
|
$code.=" mov 2048($sbox,$tmp1,4),$tmp1\n";
|
||||||
$code.=" mov 2048($tbl,$tmp2,4),$tmp2\n";
|
$code.=" mov 2048($sbox,$tmp2,4),$tmp2\n";
|
||||||
|
|
||||||
$code.=" and \$0x0000ff00,$tmp0\n";
|
$code.=" and \$0x0000ff00,$tmp0\n";
|
||||||
$code.=" and \$0x00ff0000,$tmp1\n";
|
$code.=" and \$0x00ff0000,$tmp1\n";
|
||||||
@ -299,36 +633,42 @@ _x86_64_AES_decrypt:
|
|||||||
xor 8($key),$s2
|
xor 8($key),$s2
|
||||||
xor 12($key),$s3
|
xor 12($key),$s3
|
||||||
|
|
||||||
mov 240($key),$cnt # load key->rounds
|
mov 240($key),$rnds # load key->rounds
|
||||||
sub \$1,$cnt
|
sub \$1,$rnds
|
||||||
.align 4
|
jmp .Ldec_loop
|
||||||
|
.align 16
|
||||||
.Ldec_loop:
|
.Ldec_loop:
|
||||||
___
|
___
|
||||||
&decstep(0,$s0,$s3,$s2,$s1);
|
if ($verticalspin) { &decvert(); }
|
||||||
&decstep(1,$s1,$s0,$s3,$s2);
|
else { &decstep(0,$s0,$s3,$s2,$s1);
|
||||||
&decstep(2,$s2,$s1,$s0,$s3);
|
&decstep(1,$s1,$s0,$s3,$s2);
|
||||||
&decstep(3,$s3,$s2,$s1,$s0);
|
&decstep(2,$s2,$s1,$s0,$s3);
|
||||||
|
&decstep(3,$s3,$s2,$s1,$s0);
|
||||||
|
$code.=<<___;
|
||||||
|
lea 16($key),$key
|
||||||
|
xor 0($key),$s0 # xor with key
|
||||||
|
xor 4($key),$s1
|
||||||
|
xor 8($key),$s2
|
||||||
|
xor 12($key),$s3
|
||||||
|
___
|
||||||
|
}
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
lea 16($key),$key
|
sub \$1,$rnds
|
||||||
xor 0($key),$s0 # xor with key
|
|
||||||
xor 4($key),$s1
|
|
||||||
xor 8($key),$s2
|
|
||||||
xor 12($key),$s3
|
|
||||||
|
|
||||||
sub \$1,$cnt
|
|
||||||
jnz .Ldec_loop
|
jnz .Ldec_loop
|
||||||
___
|
___
|
||||||
&declast(0,$s0,$s3,$s2,$s1);
|
if ($verticalspin) { &declastvert(); }
|
||||||
&declast(1,$s1,$s0,$s3,$s2);
|
else { &declast(0,$s0,$s3,$s2,$s1);
|
||||||
&declast(2,$s2,$s1,$s0,$s3);
|
&declast(1,$s1,$s0,$s3,$s2);
|
||||||
&declast(3,$s3,$s2,$s1,$s0);
|
&declast(2,$s2,$s1,$s0,$s3);
|
||||||
|
&declast(3,$s3,$s2,$s1,$s0);
|
||||||
|
# Final key addition for the non-vertical path.  The trailing
# "lea 16($key),$key" of the previous revision is gone, so $key still
# addresses the preceding round key here: all four words of the last
# round key must therefore be read with the 16+ bias (the encrypt
# counterpart and declastvert() do the same).  Using 4/8/12($key) for
# $s1..$s3 would mix in words of the wrong round key.
$code.=<<___;
|
||||||
|
xor 16+0($key),$s0 # xor with key
|
||||||
|
xor 16+4($key),$s1
|
||||||
|
xor 16+8($key),$s2
|
||||||
|
xor 16+12($key),$s3
|
||||||
|
___
|
||||||
|
}
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
lea 16($key),$key
|
|
||||||
xor 0($key),$s0 # xor with key
|
|
||||||
xor 4($key),$s1
|
|
||||||
xor 8($key),$s2
|
|
||||||
xor 12($key),$s3
|
|
||||||
|
|
||||||
.byte 0xf3,0xc3 # rep ret
|
.byte 0xf3,0xc3 # rep ret
|
||||||
.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
|
.size _x86_64_AES_decrypt,.-_x86_64_AES_decrypt
|
||||||
___
|
___
|
||||||
@ -347,9 +687,11 @@ AES_decrypt:
|
|||||||
push %r15
|
push %r15
|
||||||
|
|
||||||
mov %rdx,$key
|
mov %rdx,$key
|
||||||
|
mov %rdi,$inp
|
||||||
|
mov %rsi,$out
|
||||||
|
|
||||||
.picmeup $tbl
|
.picmeup $sbox
|
||||||
lea AES_Td-.($tbl),$tbl
|
lea AES_Td-.($sbox),$sbox
|
||||||
|
|
||||||
mov 0($inp),$s0
|
mov 0($inp),$s0
|
||||||
mov 4($inp),$s1
|
mov 4($inp),$s1
|
||||||
@ -719,24 +1061,24 @@ AES_cbc_encrypt:
|
|||||||
pushfq
|
pushfq
|
||||||
cld
|
cld
|
||||||
|
|
||||||
.picmeup $tbl
|
.picmeup $sbox
|
||||||
.Lcbc_pic_point:
|
.Lcbc_pic_point:
|
||||||
|
|
||||||
cmp \$0,%r9
|
cmp \$0,%r9
|
||||||
je .LDECRYPT
|
je .LDECRYPT
|
||||||
|
|
||||||
lea AES_Te-.Lcbc_pic_point($tbl),$tbl
|
lea AES_Te-.Lcbc_pic_point($sbox),$sbox
|
||||||
|
|
||||||
# allocate aligned stack frame...
|
# allocate aligned stack frame...
|
||||||
lea -64-248(%rsp),$key
|
lea -64-248(%rsp),$key
|
||||||
and \$-64,$key
|
and \$-64,$key
|
||||||
|
|
||||||
# ... and make it doesn't alias with AES_Te modulo 4096
|
# ... and make sure it doesn't alias with AES_Te modulo 4096
|
||||||
mov $tbl,%r10
|
mov $sbox,%r10
|
||||||
lea 2048($tbl),%r11
|
lea 2048($sbox),%r11
|
||||||
mov $key,%r12
|
mov $key,%r12
|
||||||
and \$0xFFF,%r10 # s = $tbl&0xfff
|
and \$0xFFF,%r10 # s = $sbox&0xfff
|
||||||
and \$0xFFF,%r11 # e = ($tbl+2048)&0xfff
|
and \$0xFFF,%r11 # e = ($sbox+2048)&0xfff
|
||||||
and \$0xFFF,%r12 # p = %rsp&0xfff
|
and \$0xFFF,%r12 # p = %rsp&0xfff
|
||||||
|
|
||||||
cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
|
cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
|
||||||
@ -758,13 +1100,15 @@ AES_cbc_encrypt:
|
|||||||
mov %rdx,$_len # save copy of len
|
mov %rdx,$_len # save copy of len
|
||||||
mov %rcx,$_key # save copy of key
|
mov %rcx,$_key # save copy of key
|
||||||
mov %r8,$_ivp # save copy of ivp
|
mov %r8,$_ivp # save copy of ivp
|
||||||
|
|
||||||
movl \$0,$mark # copy of aes_key->rounds = 0;
|
movl \$0,$mark # copy of aes_key->rounds = 0;
|
||||||
|
mov %r8,%rbp # rearrange input arguments
|
||||||
|
mov %rsi,$out
|
||||||
|
mov %rdi,$inp
|
||||||
mov %rcx,$key
|
mov %rcx,$key
|
||||||
|
|
||||||
# do we copy key schedule to stack?
|
# do we copy key schedule to stack?
|
||||||
mov $key,%r10
|
mov $key,%r10
|
||||||
sub $tbl,%r10
|
sub $sbox,%r10
|
||||||
and \$0xfff,%r10
|
and \$0xfff,%r10
|
||||||
cmp \$2048,%r10
|
cmp \$2048,%r10
|
||||||
jb .Lcbc_do_ecopy
|
jb .Lcbc_do_ecopy
|
||||||
@ -772,8 +1116,6 @@ AES_cbc_encrypt:
|
|||||||
jb .Lcbc_skip_ecopy
|
jb .Lcbc_skip_ecopy
|
||||||
.align 4
|
.align 4
|
||||||
.Lcbc_do_ecopy:
|
.Lcbc_do_ecopy:
|
||||||
mov %rsi,%r10 # backup $inp,$out
|
|
||||||
mov %rdi,%r11
|
|
||||||
mov $key,%rsi
|
mov $key,%rsi
|
||||||
lea $aes_key,%rdi
|
lea $aes_key,%rdi
|
||||||
lea $aes_key,$key
|
lea $aes_key,$key
|
||||||
@ -781,29 +1123,27 @@ AES_cbc_encrypt:
|
|||||||
.long 0x90A548F3 # rep movsq
|
.long 0x90A548F3 # rep movsq
|
||||||
mov (%rsi),%eax # copy aes_key->rounds
|
mov (%rsi),%eax # copy aes_key->rounds
|
||||||
mov %eax,(%rdi)
|
mov %eax,(%rdi)
|
||||||
mov %r10,%rsi # restore $inp,$out
|
|
||||||
mov %r11,%rdi
|
|
||||||
.Lcbc_skip_ecopy:
|
.Lcbc_skip_ecopy:
|
||||||
mov $key,$keyp # save key pointer
|
mov $key,$keyp # save key pointer
|
||||||
|
|
||||||
mov \$16,%ecx
|
mov \$16,%ecx
|
||||||
.align 4
|
.align 4
|
||||||
.Lcbc_prefetch_te:
|
.Lcbc_prefetch_te:
|
||||||
mov 0($tbl),%r10
|
mov 0($sbox),%r10
|
||||||
mov 32($tbl),%r11
|
mov 32($sbox),%r11
|
||||||
mov 64($tbl),%r12
|
mov 64($sbox),%r12
|
||||||
mov 96($tbl),%r13
|
mov 96($sbox),%r13
|
||||||
lea 128($tbl),$tbl
|
lea 128($sbox),$sbox
|
||||||
sub \$1,%ecx
|
sub \$1,%ecx
|
||||||
jnz .Lcbc_prefetch_te
|
jnz .Lcbc_prefetch_te
|
||||||
sub \$2048,$tbl
|
sub \$2048,$sbox
|
||||||
|
|
||||||
test \$-16,%rdx # check upon length
|
test \$-16,%rdx # check upon length
|
||||||
mov %rdx,%r10
|
mov %rdx,%r10
|
||||||
mov 0(%r8),$s0 # load iv
|
mov 0(%rbp),$s0 # load iv
|
||||||
mov 4(%r8),$s1
|
mov 4(%rbp),$s1
|
||||||
mov 8(%r8),$s2
|
mov 8(%rbp),$s2
|
||||||
mov 12(%r8),$s3
|
mov 12(%rbp),$s3
|
||||||
jz .Lcbc_enc_tail # short input...
|
jz .Lcbc_enc_tail # short input...
|
||||||
|
|
||||||
.align 4
|
.align 4
|
||||||
@ -812,10 +1152,12 @@ AES_cbc_encrypt:
|
|||||||
xor 4($inp),$s1
|
xor 4($inp),$s1
|
||||||
xor 8($inp),$s2
|
xor 8($inp),$s2
|
||||||
xor 12($inp),$s3
|
xor 12($inp),$s3
|
||||||
|
mov $inp,$ivec # if ($verticalspin) save inp
|
||||||
|
|
||||||
mov $keyp,$key # restore key
|
mov $keyp,$key # restore key
|
||||||
call _x86_64_AES_encrypt
|
call _x86_64_AES_encrypt
|
||||||
|
|
||||||
|
mov $ivec,$inp # if ($verticalspin) restore inp
|
||||||
mov $s0,0($out)
|
mov $s0,0($out)
|
||||||
mov $s1,4($out)
|
mov $s1,4($out)
|
||||||
mov $s2,8($out)
|
mov $s2,8($out)
|
||||||
@ -830,11 +1172,11 @@ AES_cbc_encrypt:
|
|||||||
jnz .Lcbc_enc_loop
|
jnz .Lcbc_enc_loop
|
||||||
test \$15,%r10
|
test \$15,%r10
|
||||||
jnz .Lcbc_enc_tail
|
jnz .Lcbc_enc_tail
|
||||||
mov $_ivp,%r10 # restore ivp
|
mov $_ivp,%rbp # restore ivp
|
||||||
mov $s0,0(%r10) # save ivec
|
mov $s0,0(%rbp) # save ivec
|
||||||
mov $s1,4(%r10)
|
mov $s1,4(%rbp)
|
||||||
mov $s2,8(%r10)
|
mov $s2,8(%rbp)
|
||||||
mov $s3,12(%r10)
|
mov $s3,12(%rbp)
|
||||||
|
|
||||||
.align 4
|
.align 4
|
||||||
.Lcbc_cleanup:
|
.Lcbc_cleanup:
|
||||||
@ -858,36 +1200,34 @@ AES_cbc_encrypt:
|
|||||||
.align 4
|
.align 4
|
||||||
.Lcbc_enc_tail:
|
.Lcbc_enc_tail:
|
||||||
cmp $inp,$out
|
cmp $inp,$out
|
||||||
mov $inp,%r11
|
|
||||||
mov $out,%r12
|
|
||||||
je .Lcbc_enc_in_place
|
je .Lcbc_enc_in_place
|
||||||
mov %r10,%rcx
|
mov %r10,%rcx
|
||||||
xchg %rsi,%rdi
|
mov $inp,%rsi
|
||||||
|
mov $out,%rdi
|
||||||
.long 0xF689A4F3 # rep movsb
|
.long 0xF689A4F3 # rep movsb
|
||||||
.Lcbc_enc_in_place:
|
.Lcbc_enc_in_place:
|
||||||
mov \$16,%rcx # zero tail
|
mov \$16,%rcx # zero tail
|
||||||
sub %r10,%rcx
|
sub %r10,%rcx
|
||||||
xor %rax,%rax
|
xor %rax,%rax
|
||||||
.long 0xF689AAF3 # rep stosb
|
.long 0xF689AAF3 # rep stosb
|
||||||
mov %r12,$inp # this is not a mistake!
|
mov $out,$inp # this is not a mistake!
|
||||||
mov %r12,$out
|
|
||||||
movq \$16,$_len # len=16
|
movq \$16,$_len # len=16
|
||||||
jmp .Lcbc_enc_loop # one more spin...
|
jmp .Lcbc_enc_loop # one more spin...
|
||||||
#----------------------------- DECRYPT -----------------------------#
|
#----------------------------- DECRYPT -----------------------------#
|
||||||
.align 16
|
.align 16
|
||||||
.LDECRYPT:
|
.LDECRYPT:
|
||||||
lea AES_Td-.Lcbc_pic_point($tbl),$tbl
|
lea AES_Td-.Lcbc_pic_point($sbox),$sbox
|
||||||
|
|
||||||
# allocate aligned stack frame...
|
# allocate aligned stack frame...
|
||||||
lea -64-248(%rsp),$key
|
lea -64-248(%rsp),$key
|
||||||
and \$-64,$key
|
and \$-64,$key
|
||||||
|
|
||||||
# ... and make it doesn't alias with AES_Td modulo 4096
|
# ... and make sure it doesn't alias with AES_Td modulo 4096
|
||||||
mov $tbl,%r10
|
mov $sbox,%r10
|
||||||
lea 3072($tbl),%r11
|
lea 3072($sbox),%r11
|
||||||
mov $key,%r12
|
mov $key,%r12
|
||||||
and \$0xFFF,%r10 # s = $tbl&0xfff
|
and \$0xFFF,%r10 # s = $sbox&0xfff
|
||||||
and \$0xFFF,%r11 # e = ($tbl+2048)&0xfff
|
and \$0xFFF,%r11 # e = ($sbox+3072)&0xfff
|
||||||
and \$0xFFF,%r12 # p = %rsp&0xfff
|
and \$0xFFF,%r12 # p = %rsp&0xfff
|
||||||
|
|
||||||
cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
|
cmp %r11,%r12 # if (p=>e) %rsp =- (p-e);
|
||||||
@ -909,13 +1249,15 @@ AES_cbc_encrypt:
|
|||||||
mov %rdx,$_len # save copy of len
|
mov %rdx,$_len # save copy of len
|
||||||
mov %rcx,$_key # save copy of key
|
mov %rcx,$_key # save copy of key
|
||||||
mov %r8,$_ivp # save copy of ivp
|
mov %r8,$_ivp # save copy of ivp
|
||||||
|
|
||||||
movl \$0,$mark # copy of aes_key->rounds = 0;
|
movl \$0,$mark # copy of aes_key->rounds = 0;
|
||||||
|
mov %r8,%rbp # rearrange input arguments
|
||||||
|
mov %rsi,$out
|
||||||
|
mov %rdi,$inp
|
||||||
mov %rcx,$key
|
mov %rcx,$key
|
||||||
|
|
||||||
# do we copy key schedule to stack?
|
# do we copy key schedule to stack?
|
||||||
mov $key,%r10
|
mov $key,%r10
|
||||||
sub $tbl,%r10
|
sub $sbox,%r10
|
||||||
and \$0xfff,%r10
|
and \$0xfff,%r10
|
||||||
cmp \$3072,%r10
|
cmp \$3072,%r10
|
||||||
jb .Lcbc_do_dcopy
|
jb .Lcbc_do_dcopy
|
||||||
@ -923,8 +1265,6 @@ AES_cbc_encrypt:
|
|||||||
jb .Lcbc_skip_dcopy
|
jb .Lcbc_skip_dcopy
|
||||||
.align 4
|
.align 4
|
||||||
.Lcbc_do_dcopy:
|
.Lcbc_do_dcopy:
|
||||||
mov %rsi,%r10 # backup $inp,$out
|
|
||||||
mov %rdi,%r11
|
|
||||||
mov $key,%rsi
|
mov $key,%rsi
|
||||||
lea $aes_key,%rdi
|
lea $aes_key,%rdi
|
||||||
lea $aes_key,$key
|
lea $aes_key,$key
|
||||||
@ -932,51 +1272,51 @@ AES_cbc_encrypt:
|
|||||||
.long 0x90A548F3 # rep movsq
|
.long 0x90A548F3 # rep movsq
|
||||||
mov (%rsi),%eax # copy aes_key->rounds
|
mov (%rsi),%eax # copy aes_key->rounds
|
||||||
mov %eax,(%rdi)
|
mov %eax,(%rdi)
|
||||||
mov %r10,%rsi # restore $inp,$out
|
|
||||||
mov %r11,%rdi
|
|
||||||
.Lcbc_skip_dcopy:
|
.Lcbc_skip_dcopy:
|
||||||
mov $key,$keyp # save key pointer
|
mov $key,$keyp # save key pointer
|
||||||
|
|
||||||
mov \$24,%ecx
|
mov \$24,%ecx
|
||||||
.align 4
|
.align 4
|
||||||
.Lcbc_prefetch_td:
|
.Lcbc_prefetch_td:
|
||||||
mov 0($tbl),%r10
|
mov 0($sbox),%r10
|
||||||
mov 32($tbl),%r11
|
mov 32($sbox),%r11
|
||||||
mov 64($tbl),%r12
|
mov 64($sbox),%r12
|
||||||
mov 96($tbl),%r13
|
mov 96($sbox),%r13
|
||||||
lea 128($tbl),$tbl
|
lea 128($sbox),$sbox
|
||||||
sub \$1,%ecx
|
sub \$1,%ecx
|
||||||
jnz .Lcbc_prefetch_td
|
jnz .Lcbc_prefetch_td
|
||||||
sub \$3072,$tbl
|
sub \$3072,$sbox
|
||||||
|
|
||||||
cmp $inp,$out
|
cmp $inp,$out
|
||||||
je .Lcbc_dec_in_place
|
je .Lcbc_dec_in_place
|
||||||
|
|
||||||
mov %r8,$ivec
|
mov %rbp,$ivec
|
||||||
.align 4
|
.align 4
|
||||||
.Lcbc_dec_loop:
|
.Lcbc_dec_loop:
|
||||||
mov 0($inp),$s0 # read input
|
mov 0($inp),$s0 # read input
|
||||||
mov 4($inp),$s1
|
mov 4($inp),$s1
|
||||||
mov 8($inp),$s2
|
mov 8($inp),$s2
|
||||||
mov 12($inp),$s3
|
mov 12($inp),$s3
|
||||||
|
mov $inp,8+$ivec # if ($verticalspin) save inp
|
||||||
|
|
||||||
mov $keyp,$key # load key
|
mov $keyp,$key # restore key
|
||||||
call _x86_64_AES_decrypt
|
call _x86_64_AES_decrypt
|
||||||
|
|
||||||
mov $ivec,%r8 # load ivp
|
mov $ivec,%rbp # load ivp
|
||||||
xor 0(%r8),$s0 # xor iv
|
mov 8+$ivec,$inp # if ($verticalspin) restore inp
|
||||||
xor 4(%r8),$s1
|
xor 0(%rbp),$s0 # xor iv
|
||||||
xor 8(%r8),$s2
|
xor 4(%rbp),$s1
|
||||||
xor 12(%r8),$s3
|
xor 8(%rbp),$s2
|
||||||
mov $inp,%r8 # current input, next iv
|
xor 12(%rbp),$s3
|
||||||
|
mov $inp,%rbp # current input, next iv
|
||||||
|
|
||||||
mov $_len,%r10 # load len
|
mov $_len,%r10 # load len
|
||||||
sub \$16,%r10
|
sub \$16,%r10
|
||||||
jc .Lcbc_dec_partial
|
jc .Lcbc_dec_partial
|
||||||
mov %r10,$_len # update len
|
mov %r10,$_len # update len
|
||||||
mov %r8,$ivec # update ivp
|
mov %rbp,$ivec # update ivp
|
||||||
|
|
||||||
mov $s0,0($out) # write output
|
mov $s0,0($out) # write output
|
||||||
mov $s1,4($out)
|
mov $s1,4($out)
|
||||||
mov $s2,8($out)
|
mov $s2,8($out)
|
||||||
mov $s3,12($out)
|
mov $s3,12($out)
|
||||||
@ -985,11 +1325,11 @@ AES_cbc_encrypt:
|
|||||||
lea 16($out),$out
|
lea 16($out),$out
|
||||||
jnz .Lcbc_dec_loop
|
jnz .Lcbc_dec_loop
|
||||||
.Lcbc_dec_end:
|
.Lcbc_dec_end:
|
||||||
mov $_ivp,%r9 # load user ivp
|
mov $_ivp,%r12 # load user ivp
|
||||||
mov 0(%r8),%r10 # load iv
|
mov 0(%rbp),%r10 # load iv
|
||||||
mov 8(%r8),%r11
|
mov 8(%rbp),%r11
|
||||||
mov %r10,0(%r9) # copy back to user
|
mov %r10,0(%r12) # copy back to user
|
||||||
mov %r11,8(%r9)
|
mov %r11,8(%r12)
|
||||||
jmp .Lcbc_cleanup
|
jmp .Lcbc_cleanup
|
||||||
|
|
||||||
.align 4
|
.align 4
|
||||||
@ -1007,26 +1347,28 @@ AES_cbc_encrypt:
|
|||||||
|
|
||||||
.align 16
|
.align 16
|
||||||
.Lcbc_dec_in_place:
|
.Lcbc_dec_in_place:
|
||||||
mov 0($inp),$s0 # load input
|
mov 0($inp),$s0 # load input
|
||||||
mov 4($inp),$s1
|
mov 4($inp),$s1
|
||||||
mov 8($inp),$s2
|
mov 8($inp),$s2
|
||||||
mov 12($inp),$s3
|
mov 12($inp),$s3
|
||||||
|
|
||||||
|
mov $inp,$ivec # if ($verticalspin) save inp
|
||||||
mov $keyp,$key
|
mov $keyp,$key
|
||||||
call _x86_64_AES_decrypt
|
call _x86_64_AES_decrypt
|
||||||
|
|
||||||
mov $_ivp,%r8
|
mov $ivec,$inp # if ($verticalspin) restore inp
|
||||||
xor 0(%r8),$s0
|
mov $_ivp,%rbp
|
||||||
xor 4(%r8),$s1
|
xor 0(%rbp),$s0
|
||||||
xor 8(%r8),$s2
|
xor 4(%rbp),$s1
|
||||||
xor 12(%r8),$s3
|
xor 8(%rbp),$s2
|
||||||
|
xor 12(%rbp),$s3
|
||||||
|
|
||||||
mov 0($inp),%r10 # copy input to iv
|
mov 0($inp),%r10 # copy input to iv
|
||||||
mov 8($inp),%r11
|
mov 8($inp),%r11
|
||||||
mov %r10,0(%r8)
|
mov %r10,0(%rbp)
|
||||||
mov %r11,8(%r8)
|
mov %r11,8(%rbp)
|
||||||
|
|
||||||
mov $s0,0($out) # save output [zaps input]
|
mov $s0,0($out) # save output [zaps input]
|
||||||
mov $s1,4($out)
|
mov $s1,4($out)
|
||||||
mov $s2,8($out)
|
mov $s2,8($out)
|
||||||
mov $s3,12($out)
|
mov $s3,12($out)
|
||||||
@ -1044,7 +1386,7 @@ AES_cbc_encrypt:
|
|||||||
.Lcbc_dec_in_place_partial:
|
.Lcbc_dec_in_place_partial:
|
||||||
# one can argue if this is actually required
|
# one can argue if this is actually required
|
||||||
lea ($out,%rcx),%rdi
|
lea ($out,%rcx),%rdi
|
||||||
lea (%r8,%rcx),%rsi
|
lea (%rbp,%rcx),%rsi
|
||||||
neg %rcx
|
neg %rcx
|
||||||
.long 0xF689A4F3 # rep movsb # restore tail
|
.long 0xF689A4F3 # rep movsb # restore tail
|
||||||
jmp .Lcbc_cleanup
|
jmp .Lcbc_cleanup
|
||||||
@ -1262,6 +1604,8 @@ ___
|
|||||||
&data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363);
|
&data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363);
|
||||||
&data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d);
|
&data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d);
|
||||||
|
|
||||||
|
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
||||||
|
|
||||||
print $code;
|
print $code;
|
||||||
|
|
||||||
close STDOUT;
|
close STDOUT;
|
||||||
|
@ -167,10 +167,12 @@ my $current_function;
|
|||||||
my $self = shift;
|
my $self = shift;
|
||||||
my $sz = shift;
|
my $sz = shift;
|
||||||
|
|
||||||
# silently convert all EAs to 64-bit, required for older GNU
|
# Silently convert all EAs to 64-bit. This is required for
|
||||||
# assembler and results in more compact code
|
# older GNU assembler and results in more compact code,
|
||||||
$self->{index} =~ s/^[er](.?[0-9xp])[d]?$/r\1/;
|
# *but* most importantly AES module depends on this feature!
|
||||||
$self->{base} =~ s/^[er](.?[0-9xp])[d]?$/r\1/;
|
$self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
|
||||||
|
$self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/;
|
||||||
|
|
||||||
if (!$masm) {
|
if (!$masm) {
|
||||||
# Solaris /usr/ccs/bin/as can't handle multiplications
|
# Solaris /usr/ccs/bin/as can't handle multiplications
|
||||||
# in $self->{label}
|
# in $self->{label}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user