aesni-x86_64.pl: optimize XTS.
PR: 3042
This commit is contained in:
parent
4df2280b4f
commit
36df342f9b
@ -153,14 +153,14 @@
|
|||||||
|
|
||||||
# April 2011
|
# April 2011
|
||||||
#
|
#
|
||||||
# Add aesni_xts_[en|de]crypt. Westmere spends 1.33 cycles processing
|
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
|
||||||
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
|
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
|
||||||
# in CTR mode AES instruction interleave factor was chosen to be 6x.
|
# in CTR mode AES instruction interleave factor was chosen to be 6x.
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
|
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
|
||||||
# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
|
# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
|
||||||
# in ECB, 0.71 in CTR, 0.95 in XTS... This means that aes[enc|dec]
|
# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
|
||||||
# instruction latency is 9 cycles and that they can be issued every
|
# instruction latency is 9 cycles and that they can be issued every
|
||||||
# cycle.
|
# cycle.
|
||||||
|
|
||||||
@ -1430,7 +1430,7 @@ ___
|
|||||||
my @tweak=map("%xmm$_",(10..15));
|
my @tweak=map("%xmm$_",(10..15));
|
||||||
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
|
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
|
||||||
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
|
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
|
||||||
my $frame_size = 0x60 + ($win64?160:0);
|
my $frame_size = 0x70 + ($win64?160:0);
|
||||||
|
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
.globl aesni_xts_encrypt
|
.globl aesni_xts_encrypt
|
||||||
@ -1464,213 +1464,251 @@ ___
|
|||||||
# generate the tweak
|
# generate the tweak
|
||||||
&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
|
&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
|
$movkey ($key),$rndkey0 # zero round key
|
||||||
mov $key,$key_ # backup $key
|
mov $key,$key_ # backup $key
|
||||||
mov $rnds_,$rounds # backup $rounds
|
mov $rnds_,$rounds # backup $rounds
|
||||||
|
shl \$4,$rnds_
|
||||||
mov $len,$len_ # backup $len
|
mov $len,$len_ # backup $len
|
||||||
and \$-16,$len
|
and \$-16,$len
|
||||||
|
|
||||||
|
$movkey 16($key,$rnds_),$rndkey1 # last round key
|
||||||
|
mov $rounds,$rnds_
|
||||||
|
|
||||||
movdqa .Lxts_magic(%rip),$twmask
|
movdqa .Lxts_magic(%rip),$twmask
|
||||||
pxor $twtmp,$twtmp
|
pshufd \$0x5f,@tweak[5],$twres
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
pxor $rndkey0,$rndkey1
|
||||||
___
|
___
|
||||||
|
# alternative tweak calculation algorithm is based on suggestions
|
||||||
|
# by Shay Gueron. psrad doesn't conflict with AES-NI instructions
|
||||||
|
# and should help in the future...
|
||||||
for ($i=0;$i<4;$i++) {
|
for ($i=0;$i<4;$i++) {
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
pshufd \$0x13,$twtmp,$twres
|
movdqa $twres,$twtmp
|
||||||
pxor $twtmp,$twtmp
|
paddd $twres,$twres
|
||||||
movdqa @tweak[5],@tweak[$i]
|
movdqa @tweak[5],@tweak[$i]
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
psrad \$31,$twtmp # broadcast upper bits
|
||||||
pand $twmask,$twres # isolate carry and residue
|
paddq @tweak[5],@tweak[5]
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
pand $twmask,$twtmp
|
||||||
pxor $twres,@tweak[5]
|
pxor $rndkey0,@tweak[$i]
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
___
|
___
|
||||||
}
|
}
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
|
movdqa @tweak[5],@tweak[4]
|
||||||
|
psrad \$31,$twres
|
||||||
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$twres
|
||||||
|
pxor $rndkey0,@tweak[4]
|
||||||
|
pxor $twres,@tweak[5]
|
||||||
|
movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
|
||||||
|
|
||||||
sub \$16*6,$len
|
sub \$16*6,$len
|
||||||
jc .Lxts_enc_short
|
jc .Lxts_enc_short
|
||||||
|
|
||||||
shr \$1,$rounds
|
shr \$1,$rounds
|
||||||
sub \$1,$rounds
|
sub \$3,$rounds
|
||||||
|
$movkey 16($key_),$rndkey1
|
||||||
mov $rounds,$rnds_
|
mov $rounds,$rnds_
|
||||||
|
lea .Lxts_magic(%rip),%r8
|
||||||
jmp .Lxts_enc_grandloop
|
jmp .Lxts_enc_grandloop
|
||||||
|
|
||||||
.align 16
|
.align 32
|
||||||
.Lxts_enc_grandloop:
|
.Lxts_enc_grandloop:
|
||||||
pshufd \$0x13,$twtmp,$twres
|
|
||||||
movdqa @tweak[5],@tweak[4]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
movdqu `16*0`($inp),$inout0 # load input
|
movdqu `16*0`($inp),$inout0 # load input
|
||||||
pand $twmask,$twres # isolate carry and residue
|
movdqa $rndkey0,$twmask
|
||||||
movdqu `16*1`($inp),$inout1
|
movdqu `16*1`($inp),$inout1
|
||||||
pxor $twres,@tweak[5]
|
pxor @tweak[0],$inout0
|
||||||
|
|
||||||
movdqu `16*2`($inp),$inout2
|
movdqu `16*2`($inp),$inout2
|
||||||
pxor @tweak[0],$inout0 # input^=tweak
|
|
||||||
movdqu `16*3`($inp),$inout3
|
|
||||||
pxor @tweak[1],$inout1
|
pxor @tweak[1],$inout1
|
||||||
movdqu `16*4`($inp),$inout4
|
|
||||||
pxor @tweak[2],$inout2
|
|
||||||
movdqu `16*5`($inp),$inout5
|
|
||||||
lea `16*6`($inp),$inp
|
|
||||||
pxor @tweak[3],$inout3
|
|
||||||
$movkey ($key_),$rndkey0
|
|
||||||
pxor @tweak[4],$inout4
|
|
||||||
pxor @tweak[5],$inout5
|
|
||||||
|
|
||||||
# inline _aesni_encrypt6 and interleave first and last rounds
|
|
||||||
# with own code...
|
|
||||||
$movkey 16($key_),$rndkey1
|
|
||||||
pxor $rndkey0,$inout0
|
|
||||||
pxor $rndkey0,$inout1
|
|
||||||
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
|
|
||||||
aesenc $rndkey1,$inout0
|
aesenc $rndkey1,$inout0
|
||||||
lea 32($key_),$key
|
movdqu `16*3`($inp),$inout3
|
||||||
pxor $rndkey0,$inout2
|
pxor @tweak[2],$inout2
|
||||||
movdqa @tweak[1],`16*1`(%rsp)
|
|
||||||
aesenc $rndkey1,$inout1
|
aesenc $rndkey1,$inout1
|
||||||
pxor $rndkey0,$inout3
|
movdqu `16*4`($inp),$inout4
|
||||||
movdqa @tweak[2],`16*2`(%rsp)
|
pxor @tweak[3],$inout3
|
||||||
aesenc $rndkey1,$inout2
|
aesenc $rndkey1,$inout2
|
||||||
pxor $rndkey0,$inout4
|
movdqu `16*5`($inp),$inout5
|
||||||
movdqa @tweak[3],`16*3`(%rsp)
|
pxor @tweak[5],$twmask # round[0]^=tweak[5]
|
||||||
|
movdqa 0x60(%rsp),$twres # load round[0]^round[last]
|
||||||
|
pxor @tweak[4],$inout4
|
||||||
aesenc $rndkey1,$inout3
|
aesenc $rndkey1,$inout3
|
||||||
pxor $rndkey0,$inout5
|
$movkey 32($key_),$rndkey0
|
||||||
$movkey ($key),$rndkey0
|
lea `16*6`($inp),$inp
|
||||||
dec $rounds
|
pxor $twmask,$inout5
|
||||||
movdqa @tweak[4],`16*4`(%rsp)
|
|
||||||
aesenc $rndkey1,$inout4
|
|
||||||
movdqa @tweak[5],`16*5`(%rsp)
|
|
||||||
aesenc $rndkey1,$inout5
|
|
||||||
pxor $twtmp,$twtmp
|
|
||||||
pcmpgtd @tweak[5],$twtmp
|
|
||||||
jmp .Lxts_enc_loop6_enter
|
|
||||||
|
|
||||||
.align 16
|
pxor $twres,@tweak[0]
|
||||||
|
aesenc $rndkey1,$inout4
|
||||||
|
pxor $twres,@tweak[1]
|
||||||
|
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
|
||||||
|
aesenc $rndkey1,$inout5
|
||||||
|
$movkey 48($key_),$rndkey1
|
||||||
|
|
||||||
|
aesenc $rndkey0,$inout0
|
||||||
|
pxor $twres,@tweak[2]
|
||||||
|
movdqa @tweak[1],`16*1`(%rsp)
|
||||||
|
aesenc $rndkey0,$inout1
|
||||||
|
pxor $twres,@tweak[3]
|
||||||
|
movdqa @tweak[2],`16*2`(%rsp)
|
||||||
|
aesenc $rndkey0,$inout2
|
||||||
|
pxor $twres,@tweak[4]
|
||||||
|
aesenc $rndkey0,$inout3
|
||||||
|
pxor $twres,$twmask
|
||||||
|
movdqa @tweak[4],`16*4`(%rsp)
|
||||||
|
aesenc $rndkey0,$inout4
|
||||||
|
movdqa $twmask,`16*5`(%rsp)
|
||||||
|
aesenc $rndkey0,$inout5
|
||||||
|
$movkey 64($key_),$rndkey0
|
||||||
|
lea 64($key_),$key
|
||||||
|
pshufd \$0x5f,@tweak[5],$twres
|
||||||
|
jmp .Lxts_enc_loop6
|
||||||
|
.align 32
|
||||||
.Lxts_enc_loop6:
|
.Lxts_enc_loop6:
|
||||||
aesenc $rndkey1,$inout0
|
aesenc $rndkey1,$inout0
|
||||||
aesenc $rndkey1,$inout1
|
aesenc $rndkey1,$inout1
|
||||||
dec $rounds
|
|
||||||
aesenc $rndkey1,$inout2
|
aesenc $rndkey1,$inout2
|
||||||
aesenc $rndkey1,$inout3
|
aesenc $rndkey1,$inout3
|
||||||
aesenc $rndkey1,$inout4
|
aesenc $rndkey1,$inout4
|
||||||
aesenc $rndkey1,$inout5
|
aesenc $rndkey1,$inout5
|
||||||
.Lxts_enc_loop6_enter:
|
|
||||||
$movkey 16($key),$rndkey1
|
$movkey 16($key),$rndkey1
|
||||||
|
lea 32($key),$key
|
||||||
|
|
||||||
aesenc $rndkey0,$inout0
|
aesenc $rndkey0,$inout0
|
||||||
aesenc $rndkey0,$inout1
|
aesenc $rndkey0,$inout1
|
||||||
lea 32($key),$key
|
|
||||||
aesenc $rndkey0,$inout2
|
aesenc $rndkey0,$inout2
|
||||||
aesenc $rndkey0,$inout3
|
aesenc $rndkey0,$inout3
|
||||||
aesenc $rndkey0,$inout4
|
aesenc $rndkey0,$inout4
|
||||||
aesenc $rndkey0,$inout5
|
aesenc $rndkey0,$inout5
|
||||||
$movkey ($key),$rndkey0
|
$movkey ($key),$rndkey0
|
||||||
|
dec $rounds
|
||||||
jnz .Lxts_enc_loop6
|
jnz .Lxts_enc_loop6
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
movdqa (%r8),$twmask
|
||||||
pxor $twtmp,$twtmp
|
movdqa $twres,$twtmp
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
paddd $twres,$twres
|
||||||
aesenc $rndkey1,$inout0
|
aesenc $rndkey1,$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
paddq @tweak[5],@tweak[5]
|
||||||
|
psrad \$31,$twtmp
|
||||||
aesenc $rndkey1,$inout1
|
aesenc $rndkey1,$inout1
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
pand $twmask,$twtmp
|
||||||
|
$movkey ($key_),@tweak[0] # load round[0]
|
||||||
aesenc $rndkey1,$inout2
|
aesenc $rndkey1,$inout2
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
aesenc $rndkey1,$inout3
|
aesenc $rndkey1,$inout3
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
aesenc $rndkey1,$inout4
|
aesenc $rndkey1,$inout4
|
||||||
|
movaps @tweak[0],@tweak[1] # copy round[0]
|
||||||
aesenc $rndkey1,$inout5
|
aesenc $rndkey1,$inout5
|
||||||
$movkey 16($key),$rndkey1
|
$movkey 16($key),$rndkey1
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
movdqa $twres,$twtmp
|
||||||
pxor $twtmp,$twtmp
|
paddd $twres,$twres
|
||||||
movdqa @tweak[5],@tweak[0]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
aesenc $rndkey0,$inout0
|
aesenc $rndkey0,$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
pxor @tweak[5],@tweak[0]
|
||||||
|
psrad \$31,$twtmp
|
||||||
aesenc $rndkey0,$inout1
|
aesenc $rndkey0,$inout1
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$twtmp
|
||||||
aesenc $rndkey0,$inout2
|
aesenc $rndkey0,$inout2
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
aesenc $rndkey0,$inout3
|
aesenc $rndkey0,$inout3
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
aesenc $rndkey0,$inout4
|
aesenc $rndkey0,$inout4
|
||||||
|
movaps @tweak[1],@tweak[2]
|
||||||
aesenc $rndkey0,$inout5
|
aesenc $rndkey0,$inout5
|
||||||
$movkey 32($key),$rndkey0
|
$movkey 32($key),$rndkey0
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
movdqa $twres,$twtmp
|
||||||
pxor $twtmp,$twtmp
|
paddd $twres,$twres
|
||||||
movdqa @tweak[5],@tweak[1]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
aesenc $rndkey1,$inout0
|
aesenc $rndkey1,$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
pxor @tweak[5],@tweak[1]
|
||||||
|
psrad \$31,$twtmp
|
||||||
aesenc $rndkey1,$inout1
|
aesenc $rndkey1,$inout1
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$twtmp
|
||||||
aesenc $rndkey1,$inout2
|
aesenc $rndkey1,$inout2
|
||||||
pxor $twres,@tweak[5]
|
movdqa @tweak[3],`16*3`(%rsp)
|
||||||
aesenc $rndkey1,$inout3
|
aesenc $rndkey1,$inout3
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
|
aesenc $rndkey1,$inout4
|
||||||
|
movaps @tweak[2],@tweak[3]
|
||||||
|
aesenc $rndkey1,$inout5
|
||||||
|
$movkey 48($key),$rndkey1
|
||||||
|
|
||||||
|
movdqa $twres,$twtmp
|
||||||
|
paddd $twres,$twres
|
||||||
|
aesenc $rndkey0,$inout0
|
||||||
|
pxor @tweak[5],@tweak[2]
|
||||||
|
psrad \$31,$twtmp
|
||||||
|
aesenc $rndkey0,$inout1
|
||||||
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$twtmp
|
||||||
|
aesenc $rndkey0,$inout2
|
||||||
|
aesenc $rndkey0,$inout3
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
|
aesenc $rndkey0,$inout4
|
||||||
|
movaps @tweak[3],@tweak[4]
|
||||||
|
aesenc $rndkey0,$inout5
|
||||||
|
|
||||||
|
movdqa $twres,$rndkey0
|
||||||
|
paddd $twres,$twres
|
||||||
|
aesenc $rndkey1,$inout0
|
||||||
|
pxor @tweak[5],@tweak[3]
|
||||||
|
psrad \$31,$rndkey0
|
||||||
|
aesenc $rndkey1,$inout1
|
||||||
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$rndkey0
|
||||||
|
aesenc $rndkey1,$inout2
|
||||||
|
aesenc $rndkey1,$inout3
|
||||||
|
pxor $rndkey0,@tweak[5]
|
||||||
|
$movkey ($key_),$rndkey0
|
||||||
aesenc $rndkey1,$inout4
|
aesenc $rndkey1,$inout4
|
||||||
aesenc $rndkey1,$inout5
|
aesenc $rndkey1,$inout5
|
||||||
|
$movkey 16($key_),$rndkey1
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
pxor @tweak[5],@tweak[4]
|
||||||
pxor $twtmp,$twtmp
|
psrad \$31,$twres
|
||||||
movdqa @tweak[5],@tweak[2]
|
aesenclast `16*0`(%rsp),$inout0
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
paddq @tweak[5],@tweak[5]
|
||||||
aesenclast $rndkey0,$inout0
|
pand $twmask,$twres
|
||||||
pand $twmask,$twres # isolate carry and residue
|
aesenclast `16*1`(%rsp),$inout1
|
||||||
aesenclast $rndkey0,$inout1
|
aesenclast `16*2`(%rsp),$inout2
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
|
||||||
aesenclast $rndkey0,$inout2
|
|
||||||
pxor $twres,@tweak[5]
|
pxor $twres,@tweak[5]
|
||||||
aesenclast $rndkey0,$inout3
|
aesenclast `16*3`(%rsp),$inout3
|
||||||
aesenclast $rndkey0,$inout4
|
aesenclast `16*4`(%rsp),$inout4
|
||||||
aesenclast $rndkey0,$inout5
|
aesenclast `16*5`(%rsp),$inout5
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
|
||||||
pxor $twtmp,$twtmp
|
|
||||||
movdqa @tweak[5],@tweak[3]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
xorps `16*0`(%rsp),$inout0 # output^=tweak
|
|
||||||
pand $twmask,$twres # isolate carry and residue
|
|
||||||
xorps `16*1`(%rsp),$inout1
|
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
|
|
||||||
xorps `16*2`(%rsp),$inout2
|
|
||||||
movups $inout0,`16*0`($out) # write output
|
|
||||||
xorps `16*3`(%rsp),$inout3
|
|
||||||
movups $inout1,`16*1`($out)
|
|
||||||
xorps `16*4`(%rsp),$inout4
|
|
||||||
movups $inout2,`16*2`($out)
|
|
||||||
xorps `16*5`(%rsp),$inout5
|
|
||||||
movups $inout3,`16*3`($out)
|
|
||||||
mov $rnds_,$rounds # restore $rounds
|
mov $rnds_,$rounds # restore $rounds
|
||||||
movups $inout4,`16*4`($out)
|
|
||||||
movups $inout5,`16*5`($out)
|
|
||||||
lea `16*6`($out),$out
|
lea `16*6`($out),$out
|
||||||
|
movups $inout0,`-16*6`($out) # write output
|
||||||
|
movups $inout1,`-16*5`($out)
|
||||||
|
movups $inout2,`-16*4`($out)
|
||||||
|
movups $inout3,`-16*3`($out)
|
||||||
|
movups $inout4,`-16*2`($out)
|
||||||
|
movups $inout5,`-16*1`($out)
|
||||||
sub \$16*6,$len
|
sub \$16*6,$len
|
||||||
jnc .Lxts_enc_grandloop
|
jnc .Lxts_enc_grandloop
|
||||||
|
|
||||||
lea 3($rounds,$rounds),$rounds # restore original value
|
lea 7($rounds,$rounds),$rounds # restore original value
|
||||||
mov $key_,$key # restore $key
|
mov $key_,$key # restore $key
|
||||||
mov $rounds,$rnds_ # backup $rounds
|
mov $rounds,$rnds_ # backup $rounds
|
||||||
|
|
||||||
.Lxts_enc_short:
|
.Lxts_enc_short:
|
||||||
|
pxor $rndkey0,@tweak[0]
|
||||||
add \$16*6,$len
|
add \$16*6,$len
|
||||||
jz .Lxts_enc_done
|
jz .Lxts_enc_done
|
||||||
|
|
||||||
|
pxor $rndkey0,@tweak[1]
|
||||||
cmp \$0x20,$len
|
cmp \$0x20,$len
|
||||||
jb .Lxts_enc_one
|
jb .Lxts_enc_one
|
||||||
|
pxor $rndkey0,@tweak[2]
|
||||||
je .Lxts_enc_two
|
je .Lxts_enc_two
|
||||||
|
|
||||||
|
pxor $rndkey0,@tweak[3]
|
||||||
cmp \$0x40,$len
|
cmp \$0x40,$len
|
||||||
jb .Lxts_enc_three
|
jb .Lxts_enc_three
|
||||||
|
pxor $rndkey0,@tweak[4]
|
||||||
je .Lxts_enc_four
|
je .Lxts_enc_four
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
|
||||||
movdqa @tweak[5],@tweak[4]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
movdqu ($inp),$inout0
|
movdqu ($inp),$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
|
||||||
movdqu 16*1($inp),$inout1
|
movdqu 16*1($inp),$inout1
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
|
|
||||||
movdqu 16*2($inp),$inout2
|
movdqu 16*2($inp),$inout2
|
||||||
pxor @tweak[0],$inout0
|
pxor @tweak[0],$inout0
|
||||||
movdqu 16*3($inp),$inout3
|
movdqu 16*3($inp),$inout3
|
||||||
@ -1765,15 +1803,15 @@ $code.=<<___;
|
|||||||
|
|
||||||
call _aesni_encrypt4
|
call _aesni_encrypt4
|
||||||
|
|
||||||
xorps @tweak[0],$inout0
|
pxor @tweak[0],$inout0
|
||||||
movdqa @tweak[5],@tweak[0]
|
movdqa @tweak[4],@tweak[0]
|
||||||
xorps @tweak[1],$inout1
|
pxor @tweak[1],$inout1
|
||||||
xorps @tweak[2],$inout2
|
pxor @tweak[2],$inout2
|
||||||
movups $inout0,($out)
|
movdqu $inout0,($out)
|
||||||
xorps @tweak[3],$inout3
|
pxor @tweak[3],$inout3
|
||||||
movups $inout1,16*1($out)
|
movdqu $inout1,16*1($out)
|
||||||
movups $inout2,16*2($out)
|
movdqu $inout2,16*2($out)
|
||||||
movups $inout3,16*3($out)
|
movdqu $inout3,16*3($out)
|
||||||
lea 16*4($out),$out
|
lea 16*4($out),$out
|
||||||
jmp .Lxts_enc_done
|
jmp .Lxts_enc_done
|
||||||
|
|
||||||
@ -1865,213 +1903,248 @@ $code.=<<___;
|
|||||||
shl \$4,%rax
|
shl \$4,%rax
|
||||||
sub %rax,$len
|
sub %rax,$len
|
||||||
|
|
||||||
|
$movkey ($key),$rndkey0 # zero round key
|
||||||
mov $key,$key_ # backup $key
|
mov $key,$key_ # backup $key
|
||||||
mov $rnds_,$rounds # backup $rounds
|
mov $rnds_,$rounds # backup $rounds
|
||||||
|
shl \$4,$rnds_
|
||||||
mov $len,$len_ # backup $len
|
mov $len,$len_ # backup $len
|
||||||
and \$-16,$len
|
and \$-16,$len
|
||||||
|
|
||||||
|
$movkey 16($key,$rnds_),$rndkey1 # last round key
|
||||||
|
mov $rounds,$rnds_
|
||||||
|
|
||||||
movdqa .Lxts_magic(%rip),$twmask
|
movdqa .Lxts_magic(%rip),$twmask
|
||||||
pxor $twtmp,$twtmp
|
pshufd \$0x5f,@tweak[5],$twres
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
pxor $rndkey0,$rndkey1
|
||||||
___
|
___
|
||||||
for ($i=0;$i<4;$i++) {
|
for ($i=0;$i<4;$i++) {
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
pshufd \$0x13,$twtmp,$twres
|
movdqa $twres,$twtmp
|
||||||
pxor $twtmp,$twtmp
|
paddd $twres,$twres
|
||||||
movdqa @tweak[5],@tweak[$i]
|
movdqa @tweak[5],@tweak[$i]
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
psrad \$31,$twtmp # broadcast upper bits
|
||||||
pand $twmask,$twres # isolate carry and residue
|
paddq @tweak[5],@tweak[5]
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
pand $twmask,$twtmp
|
||||||
pxor $twres,@tweak[5]
|
pxor $rndkey0,@tweak[$i]
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
___
|
___
|
||||||
}
|
}
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
|
movdqa @tweak[5],@tweak[4]
|
||||||
|
psrad \$31,$twres
|
||||||
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$twres
|
||||||
|
pxor $rndkey0,@tweak[4]
|
||||||
|
pxor $twres,@tweak[5]
|
||||||
|
movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
|
||||||
|
|
||||||
sub \$16*6,$len
|
sub \$16*6,$len
|
||||||
jc .Lxts_dec_short
|
jc .Lxts_dec_short
|
||||||
|
|
||||||
shr \$1,$rounds
|
shr \$1,$rounds
|
||||||
sub \$1,$rounds
|
sub \$3,$rounds
|
||||||
|
$movkey 16($key_),$rndkey1
|
||||||
mov $rounds,$rnds_
|
mov $rounds,$rnds_
|
||||||
|
lea .Lxts_magic(%rip),%r8
|
||||||
jmp .Lxts_dec_grandloop
|
jmp .Lxts_dec_grandloop
|
||||||
|
|
||||||
.align 16
|
.align 32
|
||||||
.Lxts_dec_grandloop:
|
.Lxts_dec_grandloop:
|
||||||
pshufd \$0x13,$twtmp,$twres
|
|
||||||
movdqa @tweak[5],@tweak[4]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
movdqu `16*0`($inp),$inout0 # load input
|
movdqu `16*0`($inp),$inout0 # load input
|
||||||
pand $twmask,$twres # isolate carry and residue
|
movdqa $rndkey0,$twmask
|
||||||
movdqu `16*1`($inp),$inout1
|
movdqu `16*1`($inp),$inout1
|
||||||
pxor $twres,@tweak[5]
|
pxor @tweak[0],$inout0
|
||||||
|
|
||||||
movdqu `16*2`($inp),$inout2
|
movdqu `16*2`($inp),$inout2
|
||||||
pxor @tweak[0],$inout0 # input^=tweak
|
|
||||||
movdqu `16*3`($inp),$inout3
|
|
||||||
pxor @tweak[1],$inout1
|
pxor @tweak[1],$inout1
|
||||||
movdqu `16*4`($inp),$inout4
|
|
||||||
pxor @tweak[2],$inout2
|
|
||||||
movdqu `16*5`($inp),$inout5
|
|
||||||
lea `16*6`($inp),$inp
|
|
||||||
pxor @tweak[3],$inout3
|
|
||||||
$movkey ($key_),$rndkey0
|
|
||||||
pxor @tweak[4],$inout4
|
|
||||||
pxor @tweak[5],$inout5
|
|
||||||
|
|
||||||
# inline _aesni_decrypt6 and interleave first and last rounds
|
|
||||||
# with own code...
|
|
||||||
$movkey 16($key_),$rndkey1
|
|
||||||
pxor $rndkey0,$inout0
|
|
||||||
pxor $rndkey0,$inout1
|
|
||||||
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks
|
|
||||||
aesdec $rndkey1,$inout0
|
aesdec $rndkey1,$inout0
|
||||||
lea 32($key_),$key
|
movdqu `16*3`($inp),$inout3
|
||||||
pxor $rndkey0,$inout2
|
pxor @tweak[2],$inout2
|
||||||
movdqa @tweak[1],`16*1`(%rsp)
|
|
||||||
aesdec $rndkey1,$inout1
|
aesdec $rndkey1,$inout1
|
||||||
pxor $rndkey0,$inout3
|
movdqu `16*4`($inp),$inout4
|
||||||
movdqa @tweak[2],`16*2`(%rsp)
|
pxor @tweak[3],$inout3
|
||||||
aesdec $rndkey1,$inout2
|
aesdec $rndkey1,$inout2
|
||||||
pxor $rndkey0,$inout4
|
movdqu `16*5`($inp),$inout5
|
||||||
movdqa @tweak[3],`16*3`(%rsp)
|
pxor @tweak[5],$twmask # round[0]^=tweak[5]
|
||||||
|
movdqa 0x60(%rsp),$twres # load round[0]^round[last]
|
||||||
|
pxor @tweak[4],$inout4
|
||||||
aesdec $rndkey1,$inout3
|
aesdec $rndkey1,$inout3
|
||||||
pxor $rndkey0,$inout5
|
$movkey 32($key_),$rndkey0
|
||||||
$movkey ($key),$rndkey0
|
lea `16*6`($inp),$inp
|
||||||
dec $rounds
|
pxor $twmask,$inout5
|
||||||
movdqa @tweak[4],`16*4`(%rsp)
|
|
||||||
aesdec $rndkey1,$inout4
|
|
||||||
movdqa @tweak[5],`16*5`(%rsp)
|
|
||||||
aesdec $rndkey1,$inout5
|
|
||||||
pxor $twtmp,$twtmp
|
|
||||||
pcmpgtd @tweak[5],$twtmp
|
|
||||||
jmp .Lxts_dec_loop6_enter
|
|
||||||
|
|
||||||
.align 16
|
pxor $twres,@tweak[0]
|
||||||
|
aesdec $rndkey1,$inout4
|
||||||
|
pxor $twres,@tweak[1]
|
||||||
|
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
|
||||||
|
aesdec $rndkey1,$inout5
|
||||||
|
$movkey 48($key_),$rndkey1
|
||||||
|
|
||||||
|
aesdec $rndkey0,$inout0
|
||||||
|
pxor $twres,@tweak[2]
|
||||||
|
movdqa @tweak[1],`16*1`(%rsp)
|
||||||
|
aesdec $rndkey0,$inout1
|
||||||
|
pxor $twres,@tweak[3]
|
||||||
|
movdqa @tweak[2],`16*2`(%rsp)
|
||||||
|
aesdec $rndkey0,$inout2
|
||||||
|
pxor $twres,@tweak[4]
|
||||||
|
aesdec $rndkey0,$inout3
|
||||||
|
pxor $twres,$twmask
|
||||||
|
movdqa @tweak[4],`16*4`(%rsp)
|
||||||
|
aesdec $rndkey0,$inout4
|
||||||
|
movdqa $twmask,`16*5`(%rsp)
|
||||||
|
aesdec $rndkey0,$inout5
|
||||||
|
$movkey 64($key_),$rndkey0
|
||||||
|
lea 64($key_),$key
|
||||||
|
pshufd \$0x5f,@tweak[5],$twres
|
||||||
|
jmp .Lxts_dec_loop6
|
||||||
|
.align 32
|
||||||
.Lxts_dec_loop6:
|
.Lxts_dec_loop6:
|
||||||
aesdec $rndkey1,$inout0
|
aesdec $rndkey1,$inout0
|
||||||
aesdec $rndkey1,$inout1
|
aesdec $rndkey1,$inout1
|
||||||
dec $rounds
|
|
||||||
aesdec $rndkey1,$inout2
|
aesdec $rndkey1,$inout2
|
||||||
aesdec $rndkey1,$inout3
|
aesdec $rndkey1,$inout3
|
||||||
aesdec $rndkey1,$inout4
|
aesdec $rndkey1,$inout4
|
||||||
aesdec $rndkey1,$inout5
|
aesdec $rndkey1,$inout5
|
||||||
.Lxts_dec_loop6_enter:
|
|
||||||
$movkey 16($key),$rndkey1
|
$movkey 16($key),$rndkey1
|
||||||
|
lea 32($key),$key
|
||||||
|
|
||||||
aesdec $rndkey0,$inout0
|
aesdec $rndkey0,$inout0
|
||||||
aesdec $rndkey0,$inout1
|
aesdec $rndkey0,$inout1
|
||||||
lea 32($key),$key
|
|
||||||
aesdec $rndkey0,$inout2
|
aesdec $rndkey0,$inout2
|
||||||
aesdec $rndkey0,$inout3
|
aesdec $rndkey0,$inout3
|
||||||
aesdec $rndkey0,$inout4
|
aesdec $rndkey0,$inout4
|
||||||
aesdec $rndkey0,$inout5
|
aesdec $rndkey0,$inout5
|
||||||
$movkey ($key),$rndkey0
|
$movkey ($key),$rndkey0
|
||||||
|
dec $rounds
|
||||||
jnz .Lxts_dec_loop6
|
jnz .Lxts_dec_loop6
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
movdqa (%r8),$twmask
|
||||||
pxor $twtmp,$twtmp
|
movdqa $twres,$twtmp
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
paddd $twres,$twres
|
||||||
aesdec $rndkey1,$inout0
|
aesdec $rndkey1,$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
paddq @tweak[5],@tweak[5]
|
||||||
|
psrad \$31,$twtmp
|
||||||
aesdec $rndkey1,$inout1
|
aesdec $rndkey1,$inout1
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
pand $twmask,$twtmp
|
||||||
|
$movkey ($key_),@tweak[0] # load round[0]
|
||||||
aesdec $rndkey1,$inout2
|
aesdec $rndkey1,$inout2
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
aesdec $rndkey1,$inout3
|
aesdec $rndkey1,$inout3
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
aesdec $rndkey1,$inout4
|
aesdec $rndkey1,$inout4
|
||||||
|
movaps @tweak[0],@tweak[1] # copy round[0]
|
||||||
aesdec $rndkey1,$inout5
|
aesdec $rndkey1,$inout5
|
||||||
$movkey 16($key),$rndkey1
|
$movkey 16($key),$rndkey1
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
movdqa $twres,$twtmp
|
||||||
pxor $twtmp,$twtmp
|
paddd $twres,$twres
|
||||||
movdqa @tweak[5],@tweak[0]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
aesdec $rndkey0,$inout0
|
aesdec $rndkey0,$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
pxor @tweak[5],@tweak[0]
|
||||||
|
psrad \$31,$twtmp
|
||||||
aesdec $rndkey0,$inout1
|
aesdec $rndkey0,$inout1
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$twtmp
|
||||||
aesdec $rndkey0,$inout2
|
aesdec $rndkey0,$inout2
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
aesdec $rndkey0,$inout3
|
aesdec $rndkey0,$inout3
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
aesdec $rndkey0,$inout4
|
aesdec $rndkey0,$inout4
|
||||||
|
movaps @tweak[1],@tweak[2]
|
||||||
aesdec $rndkey0,$inout5
|
aesdec $rndkey0,$inout5
|
||||||
$movkey 32($key),$rndkey0
|
$movkey 32($key),$rndkey0
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
movdqa $twres,$twtmp
|
||||||
pxor $twtmp,$twtmp
|
paddd $twres,$twres
|
||||||
movdqa @tweak[5],@tweak[1]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
aesdec $rndkey1,$inout0
|
aesdec $rndkey1,$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
pxor @tweak[5],@tweak[1]
|
||||||
|
psrad \$31,$twtmp
|
||||||
aesdec $rndkey1,$inout1
|
aesdec $rndkey1,$inout1
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$twtmp
|
||||||
aesdec $rndkey1,$inout2
|
aesdec $rndkey1,$inout2
|
||||||
pxor $twres,@tweak[5]
|
movdqa @tweak[3],`16*3`(%rsp)
|
||||||
aesdec $rndkey1,$inout3
|
aesdec $rndkey1,$inout3
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
|
aesdec $rndkey1,$inout4
|
||||||
|
movaps @tweak[2],@tweak[3]
|
||||||
|
aesdec $rndkey1,$inout5
|
||||||
|
$movkey 48($key),$rndkey1
|
||||||
|
|
||||||
|
movdqa $twres,$twtmp
|
||||||
|
paddd $twres,$twres
|
||||||
|
aesdec $rndkey0,$inout0
|
||||||
|
pxor @tweak[5],@tweak[2]
|
||||||
|
psrad \$31,$twtmp
|
||||||
|
aesdec $rndkey0,$inout1
|
||||||
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$twtmp
|
||||||
|
aesdec $rndkey0,$inout2
|
||||||
|
aesdec $rndkey0,$inout3
|
||||||
|
pxor $twtmp,@tweak[5]
|
||||||
|
aesdec $rndkey0,$inout4
|
||||||
|
movaps @tweak[3],@tweak[4]
|
||||||
|
aesdec $rndkey0,$inout5
|
||||||
|
|
||||||
|
movdqa $twres,$rndkey0
|
||||||
|
paddd $twres,$twres
|
||||||
|
aesdec $rndkey1,$inout0
|
||||||
|
pxor @tweak[5],@tweak[3]
|
||||||
|
psrad \$31,$rndkey0
|
||||||
|
aesdec $rndkey1,$inout1
|
||||||
|
paddq @tweak[5],@tweak[5]
|
||||||
|
pand $twmask,$rndkey0
|
||||||
|
aesdec $rndkey1,$inout2
|
||||||
|
aesdec $rndkey1,$inout3
|
||||||
|
pxor $rndkey0,@tweak[5]
|
||||||
|
$movkey ($key_),$rndkey0
|
||||||
aesdec $rndkey1,$inout4
|
aesdec $rndkey1,$inout4
|
||||||
aesdec $rndkey1,$inout5
|
aesdec $rndkey1,$inout5
|
||||||
|
$movkey 16($key_),$rndkey1
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
pxor @tweak[5],@tweak[4]
|
||||||
pxor $twtmp,$twtmp
|
psrad \$31,$twres
|
||||||
movdqa @tweak[5],@tweak[2]
|
aesdeclast `16*0`(%rsp),$inout0
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
paddq @tweak[5],@tweak[5]
|
||||||
aesdeclast $rndkey0,$inout0
|
pand $twmask,$twres
|
||||||
pand $twmask,$twres # isolate carry and residue
|
aesdeclast `16*1`(%rsp),$inout1
|
||||||
aesdeclast $rndkey0,$inout1
|
aesdeclast `16*2`(%rsp),$inout2
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
|
||||||
aesdeclast $rndkey0,$inout2
|
|
||||||
pxor $twres,@tweak[5]
|
pxor $twres,@tweak[5]
|
||||||
aesdeclast $rndkey0,$inout3
|
aesdeclast `16*3`(%rsp),$inout3
|
||||||
aesdeclast $rndkey0,$inout4
|
aesdeclast `16*4`(%rsp),$inout4
|
||||||
aesdeclast $rndkey0,$inout5
|
aesdeclast `16*5`(%rsp),$inout5
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
|
||||||
pxor $twtmp,$twtmp
|
|
||||||
movdqa @tweak[5],@tweak[3]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
xorps `16*0`(%rsp),$inout0 # output^=tweak
|
|
||||||
pand $twmask,$twres # isolate carry and residue
|
|
||||||
xorps `16*1`(%rsp),$inout1
|
|
||||||
pcmpgtd @tweak[5],$twtmp # broadcast upper bits
|
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
|
|
||||||
xorps `16*2`(%rsp),$inout2
|
|
||||||
movups $inout0,`16*0`($out) # write output
|
|
||||||
xorps `16*3`(%rsp),$inout3
|
|
||||||
movups $inout1,`16*1`($out)
|
|
||||||
xorps `16*4`(%rsp),$inout4
|
|
||||||
movups $inout2,`16*2`($out)
|
|
||||||
xorps `16*5`(%rsp),$inout5
|
|
||||||
movups $inout3,`16*3`($out)
|
|
||||||
mov $rnds_,$rounds # restore $rounds
|
mov $rnds_,$rounds # restore $rounds
|
||||||
movups $inout4,`16*4`($out)
|
|
||||||
movups $inout5,`16*5`($out)
|
|
||||||
lea `16*6`($out),$out
|
lea `16*6`($out),$out
|
||||||
|
movups $inout0,`-16*6`($out) # write output
|
||||||
|
movups $inout1,`-16*5`($out)
|
||||||
|
movups $inout2,`-16*4`($out)
|
||||||
|
movups $inout3,`-16*3`($out)
|
||||||
|
movups $inout4,`-16*2`($out)
|
||||||
|
movups $inout5,`-16*1`($out)
|
||||||
sub \$16*6,$len
|
sub \$16*6,$len
|
||||||
jnc .Lxts_dec_grandloop
|
jnc .Lxts_dec_grandloop
|
||||||
|
|
||||||
lea 3($rounds,$rounds),$rounds # restore original value
|
lea 7($rounds,$rounds),$rounds # restore original value
|
||||||
mov $key_,$key # restore $key
|
mov $key_,$key # restore $key
|
||||||
mov $rounds,$rnds_ # backup $rounds
|
mov $rounds,$rnds_ # backup $rounds
|
||||||
|
|
||||||
.Lxts_dec_short:
|
.Lxts_dec_short:
|
||||||
|
pxor $rndkey0,@tweak[0]
|
||||||
|
pxor $rndkey0,@tweak[1]
|
||||||
add \$16*6,$len
|
add \$16*6,$len
|
||||||
jz .Lxts_dec_done
|
jz .Lxts_dec_done
|
||||||
|
|
||||||
|
pxor $rndkey0,@tweak[2]
|
||||||
cmp \$0x20,$len
|
cmp \$0x20,$len
|
||||||
jb .Lxts_dec_one
|
jb .Lxts_dec_one
|
||||||
|
pxor $rndkey0,@tweak[3]
|
||||||
je .Lxts_dec_two
|
je .Lxts_dec_two
|
||||||
|
|
||||||
|
pxor $rndkey0,@tweak[4]
|
||||||
cmp \$0x40,$len
|
cmp \$0x40,$len
|
||||||
jb .Lxts_dec_three
|
jb .Lxts_dec_three
|
||||||
je .Lxts_dec_four
|
je .Lxts_dec_four
|
||||||
|
|
||||||
pshufd \$0x13,$twtmp,$twres
|
|
||||||
movdqa @tweak[5],@tweak[4]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
movdqu ($inp),$inout0
|
movdqu ($inp),$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
|
||||||
movdqu 16*1($inp),$inout1
|
movdqu 16*1($inp),$inout1
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
|
|
||||||
movdqu 16*2($inp),$inout2
|
movdqu 16*2($inp),$inout2
|
||||||
pxor @tweak[0],$inout0
|
pxor @tweak[0],$inout0
|
||||||
movdqu 16*3($inp),$inout3
|
movdqu 16*3($inp),$inout3
|
||||||
@ -2156,7 +2229,7 @@ $code.=<<___;
|
|||||||
xorps @tweak[0],$inout0
|
xorps @tweak[0],$inout0
|
||||||
movdqa @tweak[3],@tweak[0]
|
movdqa @tweak[3],@tweak[0]
|
||||||
xorps @tweak[1],$inout1
|
xorps @tweak[1],$inout1
|
||||||
movdqa @tweak[5],@tweak[1]
|
movdqa @tweak[4],@tweak[1]
|
||||||
xorps @tweak[2],$inout2
|
xorps @tweak[2],$inout2
|
||||||
movups $inout0,($out)
|
movups $inout0,($out)
|
||||||
movups $inout1,16*1($out)
|
movups $inout1,16*1($out)
|
||||||
@ -2166,14 +2239,8 @@ $code.=<<___;
|
|||||||
|
|
||||||
.align 16
|
.align 16
|
||||||
.Lxts_dec_four:
|
.Lxts_dec_four:
|
||||||
pshufd \$0x13,$twtmp,$twres
|
|
||||||
movdqa @tweak[5],@tweak[4]
|
|
||||||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
|
||||||
movups ($inp),$inout0
|
movups ($inp),$inout0
|
||||||
pand $twmask,$twres # isolate carry and residue
|
|
||||||
movups 16*1($inp),$inout1
|
movups 16*1($inp),$inout1
|
||||||
pxor $twres,@tweak[5]
|
|
||||||
|
|
||||||
movups 16*2($inp),$inout2
|
movups 16*2($inp),$inout2
|
||||||
xorps @tweak[0],$inout0
|
xorps @tweak[0],$inout0
|
||||||
movups 16*3($inp),$inout3
|
movups 16*3($inp),$inout3
|
||||||
@ -2184,16 +2251,16 @@ $code.=<<___;
|
|||||||
|
|
||||||
call _aesni_decrypt4
|
call _aesni_decrypt4
|
||||||
|
|
||||||
xorps @tweak[0],$inout0
|
pxor @tweak[0],$inout0
|
||||||
movdqa @tweak[4],@tweak[0]
|
movdqa @tweak[4],@tweak[0]
|
||||||
xorps @tweak[1],$inout1
|
pxor @tweak[1],$inout1
|
||||||
movdqa @tweak[5],@tweak[1]
|
movdqa @tweak[5],@tweak[1]
|
||||||
xorps @tweak[2],$inout2
|
pxor @tweak[2],$inout2
|
||||||
movups $inout0,($out)
|
movdqu $inout0,($out)
|
||||||
xorps @tweak[3],$inout3
|
pxor @tweak[3],$inout3
|
||||||
movups $inout1,16*1($out)
|
movdqu $inout1,16*1($out)
|
||||||
movups $inout2,16*2($out)
|
movdqu $inout2,16*2($out)
|
||||||
movups $inout3,16*3($out)
|
movdqu $inout3,16*3($out)
|
||||||
lea 16*4($out),$out
|
lea 16*4($out),$out
|
||||||
jmp .Lxts_dec_done
|
jmp .Lxts_dec_done
|
||||||
|
|
||||||
@ -3240,6 +3307,19 @@ sub aesni {
|
|||||||
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
|
push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
|
||||||
return ".byte\t".join(',',@opcode);
|
return ".byte\t".join(',',@opcode);
|
||||||
}
|
}
|
||||||
|
elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
|
||||||
|
my %opcodelet = (
|
||||||
|
"aesenc" => 0xdc, "aesenclast" => 0xdd,
|
||||||
|
"aesdec" => 0xde, "aesdeclast" => 0xdf
|
||||||
|
);
|
||||||
|
return undef if (!defined($opcodelet{$1}));
|
||||||
|
my $off = $2;
|
||||||
|
push @opcode,0x44 if ($3>=8);
|
||||||
|
push @opcode,0x0f,0x38,$opcodelet{$1};
|
||||||
|
push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
|
||||||
|
push @opcode,($off=~/^0/?oct($off):$off)&0xff;
|
||||||
|
return ".byte\t".join(',',@opcode);
|
||||||
|
}
|
||||||
return $line;
|
return $line;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user