openssl/crypto/aes/asm/aesni-x86_64.pl
Andy Polyakov 36df342f9b aesni-x86_64.pl: optimize XTS.
PR: 3042
2013-05-25 19:23:09 +02:00

3332 lines
81 KiB
Perl
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
# details].
#
# Performance.
#
# Given aes(enc|dec) instructions' latency asymptotic performance for
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# asymptotic limit it's not something you commonly achieve in reality,
# but how close does one get? Below are results collected for
# different modes and block sizes. Pairs of numbers are for en-/
# decryption.
#
#        16-byte     64-byte     256-byte    1-KB        8-KB
# ECB    4.25/4.25   1.38/1.38   1.28/1.28   1.26/1.26   1.26/1.26
# CTR    5.42/5.42   1.92/1.92   1.44/1.44   1.28/1.28   1.26/1.26
# CBC    4.38/4.43   4.15/1.43   4.07/1.32   4.07/1.29   4.06/1.28
# CCM    5.66/9.42   4.42/5.41   4.16/4.40   4.09/4.15   4.06/4.07
# OFB    5.42/5.42   4.64/4.64   4.44/4.44   4.39/4.39   4.38/4.38
# CFB    5.73/5.85   5.56/5.62   5.48/5.56   5.47/5.55   5.47/5.55
#
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
# The results were collected with specially crafted speed.c benchmark
# in order to compare them with results reported in "Intel Advanced
# Encryption Standard (AES) New Instruction Set" White Paper Revision
# 3.0 dated May 2010. All above results are consistently better. This
# module also provides better performance for block sizes smaller than
# 128 bytes in points *not* represented in the above table.
#
# Looking at the results for 8-KB buffer.
#
# CFB and OFB results are far from the limit, because implementation
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
# single-block aesni_encrypt, which is not the most optimal way to go.
# CBC encrypt result is unexpectedly high and there is no documented
# explanation for it. Seemingly there is a small penalty for feeding
# the result back to AES unit the way it's done in CBC mode. There is
# nothing one can do and the result appears optimal. CCM result is
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
# saving output. CCM CTR "stays invisible," because it's neatly
# interleaved with CBC-MAC. This provides ~30% improvement over
# "straightforward" CCM implementation with CTR and CBC-MAC performed
# disjointly. Parallelizable modes practically achieve the theoretical
# limit.
#
# Looking at how results vary with buffer size.
#
# Curves are practically saturated at 1-KB buffer size. In most cases
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
# CTR curve doesn't follow this pattern and is "slowest" changing one
# with "256-byte" result being 87% of "8-KB." This is because overhead
# in CTR mode is most computationally intensive. Small-block CCM
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
# iterations can't be interleaved.
#
# Results for 192- and 256-bit keys.
#
# EVP-free results were observed to scale perfectly with number of
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
# are a tad smaller, because the above mentioned penalty biases all
# results by same constant value. In similar way function call
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
# January 2011
#
# While Westmere processor features 6 cycles latency for aes[enc|dec]
# instructions, which can be scheduled every second cycle, Sandy
# Bridge spends 8 cycles per instruction, but it can schedule them
# every cycle. This means that code targeting Westmere would perform
# suboptimally on Sandy Bridge. Therefore this update.
#
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
# optimized. Relative improvement might appear modest, 8% on Westmere,
# but in absolute terms it's 3.77 cycles per byte encrypted with
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
# should be compared to asymptotic limits of 3.75 for Westmere and
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
# to asymptotic limits is quite amazing. Indeed, the limit is
# calculated as latency times number of rounds, 10 for 128-bit key,
# and divided by 16, the number of bytes in block, or in other words
# it accounts *solely* for aesenc instructions. But there are extra
# instructions, and numbers so close to the asymptotic limits mean
# that it's as if it takes as little as *one* additional cycle to
# execute all of them. How is it possible? It is possible thanks to
# out-of-order execution logic, which manages to overlap post-
# processing of previous block, things like saving the output, with
# actual encryption of current block, as well as pre-processing of
# current block, things like fetching input and xor-ing it with
# 0-round element of the key schedule, with actual encryption of
# previous block. Keep this in mind...
#
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
# performance is achieved by interleaving instructions working on
# independent blocks. In which case asymptotic limit for such modes
# can be obtained by dividing above mentioned numbers by AES
# instructions' interleave factor. Westmere can execute at most 3
# instructions at a time, meaning that optimal interleave factor is 3,
# and that's where the "magic" number of 1.25 comes from. "Optimal
# interleave factor" means that increase of interleave factor does
# not improve performance. The formula has proven to reflect reality
# pretty well on Westmere... Sandy Bridge on the other hand can
# execute up to 8 AES instructions at a time, so how does varying
# interleave factor affect the performance? Here is table for ECB
# (numbers are cycles per byte processed with 128-bit key):
#
# instruction interleave factor        3x      6x      8x
# theoretical asymptotic limit         1.67    0.83    0.625
# measured performance for 8KB block   1.05    0.86    0.84
#
# "as if" interleave factor            4.7x    5.8x    6.0x
#
# Further data for other parallelizable modes:
#
# CBC decrypt                          1.16    0.93    0.74
# CTR                                  1.14    0.91    0.74
#
# Well, given 3x column it's probably inappropriate to call the limit
# asymptotic, if it can be surpassed, isn't it? What happens there?
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
# magic is responsible for this. Processor overlaps not only the
# additional instructions with AES ones, but even AES instructions
# processing adjacent triplets of independent blocks. In the 6x case
# additional instructions still claim disproportionally small amount
# of additional cycles, but in 8x case number of instructions must be
# a tad too high for out-of-order logic to cope with, and AES unit
# remains underutilized... As you can see 8x interleave is hardly
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
# utilizes 6x interleave because of limited register bank capacity.
#
# Higher interleave factors do have negative impact on Westmere
# performance. While for ECB mode it's negligible ~1.5%, other
# parallelizables perform ~5% worse, which is outweighed by ~25%
# improvement on Sandy Bridge. To balance regression on Westmere
# CTR mode was implemented with 6x aesenc interleave factor.
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
# in CTR mode AES instruction interleave factor was chosen to be 6x.
######################################################################
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
# instruction latency is 9 cycles and that they can be issued every
# cycle.
# ---------------------------------------------------------------------
# Script setup: pick the symbol prefix, parse the command line, route
# everything printed to STDOUT through x86_64-xlate.pl and name the
# registers used by the code templates below. All variables here are
# package globals on purpose: the generator subs and templates further
# down refer to them by name.
# ---------------------------------------------------------------------
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-x86_64.pl:-)

# Command line is "<flavour> <output>". A first argument containing a
# dot is taken to be the output file name and $flavour is left undefined.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by the flavour or by an .asm output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script first, then in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe the generated code through the translator. The result of open is
# checked: without the check a failed pipe open would leave STDOUT aliased
# to a closed handle and the generated code would be dropped.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Instruction used to load round keys; both prefixes currently map to
# the same (unaligned) load.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";
@_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code=".text\n";
$rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8"; # cbc, ctr, ...
$rnds_="%r10d"; # backup copy for $rounds
$key_="%r11"; # backup copy for $key
# %xmm register layout
$rndkey0="%xmm0"; $rndkey1="%xmm1";
$inout0="%xmm2"; $inout1="%xmm3";
$inout2="%xmm4"; $inout3="%xmm5";
$inout4="%xmm6"; $inout5="%xmm7";
$inout6="%xmm8"; $inout7="%xmm9";
# note: the names below share registers with $inout4..$inout7
$in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
$in0="%xmm8"; $iv="%xmm9";
# aesni_generate1($p, $key, $rounds [, $inout [, $ivec]])
#
# Appends to $code an inline, non-interleaved single-block AES sequence,
# i.e. the inline version of the internal aesni_[en|de]crypt1.
#   $p      - "enc" or "dec"; spliced into the aes${p}/aes${p}last mnemonics
#   $key    - register pointing at the key schedule (advanced by the code)
#   $rounds - register holding the round count (counted down to zero)
#   $inout  - register holding the block, $inout0 when omitted
#   $ivec   - optional register: when given, round key 0 is xor-ed into it
#             and it is then xor-ed into $inout; otherwise round key 0 is
#             xor-ed into $inout directly
#
# Why a folded loop (one aes instruction per iteration)? Because
# aes[enc|dec] is slow enough to accommodate the cycles which take care
# of the loop variables. A private counter makes every expansion use its
# own loop label.
{ my $serial;
sub aesni_generate1 {
    my ($p, $kp, $cnt, $blk, $ivec) = @_;
    $blk = $inout0 unless defined($blk);
    my $loop = ".Loop_${p}1_" . ++$serial;

    # round-0 whitening, with or without the extra $ivec operand
    my @whiten = defined($ivec)
        ? ("xorps $rndkey0,$ivec", "lea 32($kp),$kp", "xorps $ivec,$blk")
        : ("lea 32($kp),$kp", "xorps $rndkey0,$blk");

    $code .= join("\n",
        "$movkey ($kp),$rndkey0",
        "$movkey 16($kp),$rndkey1",
        @whiten,
        "${loop}:",
        "aes${p} $rndkey1,$blk",
        "dec $cnt",
        "$movkey ($kp),$rndkey1",
        "lea 16($kp),$kp",
        "jnz $loop # loop body is 16 bytes",
        "aes${p}last $rndkey1,$blk",
    ) . "\n";
}}
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the two public single-block entry points. Both are declared
# \@abi-omnipotent and take their three arguments from the first three
# entries of @_4args. Each loads one block, reads the round count from
# offset 240 of the key structure, runs the inline single-block sequence
# and stores the result.
{
    my ($src, $dst, $ks) = @_4args;

    # [ aes mnemonic suffix, symbol suffix, separator used in .size ]
    # The decrypt .size directive has always carried a blank after the
    # comma; it is kept so that the emitted text stays the same.
    for my $variant (["enc", "encrypt", ","], ["dec", "decrypt", ", "]) {
        my ($op, $verb, $sep) = @$variant;
        my $sym = "${PREFIX}_$verb";

        $code .= <<___;
.globl $sym
.type $sym,\@abi-omnipotent
.align 16
${sym}:
movups ($src),$inout0 # load input
mov 240($ks),$rounds # key->rounds
___
        aesni_generate1($op, $ks, $rounds);
        $code .= <<___;
movups $inout0,($dst) # output
ret
.size $sym$sep.-$sym
___
    }
}
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why was a 3x subroutine originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it makes no sense to implement 2x subroutine.
# aes[enc|dec] latency in next processor generation is 8, but the
# instructions can be scheduled every cycle. Optimal interleave for
# new processor is therefore 8x...
# aesni_generate3($dir)
#
# Appends the private 3-way interleaved subroutine _aesni_${dir}rypt3 to
# $code; $dir is "enc" or "dec". The emitted routine takes the key
# pointer in $key and the round count in $rounds, neither of which is
# preserved, and transforms $inout0..$inout2 in place. The round count
# is halved up front because every loop iteration applies two round keys.
sub aesni_generate3 {
    my $way  = shift;
    my $name = "_aesni_${way}rypt3";
    my $aes  = "aes$way";
    my @blk  = ($inout0, $inout1, $inout2);

    # one "<insn> <round key>,<block>" line per listed block register
    my $each = sub { my ($insn, $rk, @regs) = @_; map { "$insn $rk,$_" } @regs };

    $code .= join("\n",
        ".type $name,\@abi-omnipotent",
        ".align 16",
        "${name}:",
        "$movkey ($key),$rndkey0",
        "shr \$1,$rounds",
        "$movkey 16($key),$rndkey1",
        "lea 32($key),$key",
        $each->("xorps", $rndkey0, @blk),
        "$movkey ($key),$rndkey0",
        ".L${way}_loop3:",
        $each->($aes, $rndkey1, @blk[0, 1]),
        "dec $rounds",
        $each->($aes, $rndkey1, $blk[2]),
        "$movkey 16($key),$rndkey1",
        $each->($aes, $rndkey0, @blk[0, 1]),
        "lea 32($key),$key",
        $each->($aes, $rndkey0, $blk[2]),
        "$movkey ($key),$rndkey0",
        "jnz .L${way}_loop3",
        $each->($aes, $rndkey1, @blk),
        $each->("${aes}last", $rndkey0, @blk),
        "ret",
        ".size $name,.-$name",
    ) . "\n";
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
# aesni_generate4($dir)
#
# Appends the private 4-way interleaved subroutine _aesni_${dir}rypt4 to
# $code; $dir is "enc" or "dec". The emitted routine takes the key
# pointer in $key and the round count in $rounds, neither of which is
# preserved, and transforms $inout0..$inout3 in place.
sub aesni_generate4 {
    my $way  = shift;
    my $name = "_aesni_${way}rypt4";
    my $aes  = "aes$way";
    my @blk  = ($inout0, $inout1, $inout2, $inout3);

    # one "<insn> <round key>,<block>" line per listed block register
    my $each = sub { my ($insn, $rk, @regs) = @_; map { "$insn $rk,$_" } @regs };

    $code .= join("\n",
        ".type $name,\@abi-omnipotent",
        ".align 16",
        "${name}:",
        "$movkey ($key),$rndkey0",
        "shr \$1,$rounds",
        "$movkey 16($key),$rndkey1",
        "lea 32($key),$key",
        $each->("xorps", $rndkey0, @blk),
        "$movkey ($key),$rndkey0",
        ".L${way}_loop4:",
        $each->($aes, $rndkey1, @blk[0, 1]),
        "dec $rounds",
        $each->($aes, $rndkey1, @blk[2, 3]),
        "$movkey 16($key),$rndkey1",
        $each->($aes, $rndkey0, @blk[0, 1]),
        "lea 32($key),$key",
        $each->($aes, $rndkey0, @blk[2, 3]),
        "$movkey ($key),$rndkey0",
        "jnz .L${way}_loop4",
        $each->($aes, $rndkey1, @blk),
        $each->("${aes}last", $rndkey0, @blk),
        "ret",
        ".size $name,.-$name",
    ) . "\n";
}
# aesni_generate6($dir)
#
# Appends the private 6-way interleaved subroutine _aesni_${dir}rypt6 to
# $code; $dir is "enc" or "dec". The emitted routine takes the key
# pointer in $key and the round count in $rounds, neither of which is
# preserved, and transforms $inout0..$inout5 in place. The first round
# is interleaved with the round-0 key xor-s, after which execution jumps
# into the middle of the two-rounds-per-iteration loop.
sub aesni_generate6 {
    my $way  = shift;
    my $name = "_aesni_${way}rypt6";
    my $aes  = "aes$way";
    my @blk  = ($inout0, $inout1, $inout2, $inout3, $inout4, $inout5);

    # one "<insn> <round key>,<block>" line per listed block register
    my $each = sub { my ($insn, $rk, @regs) = @_; map { "$insn $rk,$_" } @regs };

    # whitening of block i is paired with the first round of block i-1
    my @prologue = $each->("xorps", $rndkey0, $blk[0]);
    for my $i (1 .. 4) {
        push @prologue, $each->("pxor", $rndkey0, $blk[$i]),
                        $each->($aes,   $rndkey1, $blk[$i - 1]);
    }
    push @prologue,
        $each->("pxor", $rndkey0, $blk[5]),
        "dec $rounds",
        $each->($aes, $rndkey1, $blk[4]),
        "$movkey ($key),$rndkey0",
        $each->($aes, $rndkey1, $blk[5]),
        "jmp .L${way}_loop6_enter";

    $code .= join("\n",
        ".type $name,\@abi-omnipotent",
        ".align 16",
        "${name}:",
        "$movkey ($key),$rndkey0",
        "shr \$1,$rounds",
        "$movkey 16($key),$rndkey1",
        "lea 32($key),$key",
        @prologue,
        ".align 16",
        ".L${way}_loop6:",
        $each->($aes, $rndkey1, @blk[0, 1]),
        "dec $rounds",
        $each->($aes, $rndkey1, @blk[2 .. 5]),
        ".L${way}_loop6_enter: # happens to be 16-byte aligned",
        "$movkey 16($key),$rndkey1",
        $each->($aes, $rndkey0, @blk[0, 1]),
        "lea 32($key),$key",
        $each->($aes, $rndkey0, @blk[2 .. 5]),
        "$movkey ($key),$rndkey0",
        "jnz .L${way}_loop6",
        $each->($aes, $rndkey1, @blk),
        $each->("${aes}last", $rndkey0, @blk),
        "ret",
        ".size $name,.-$name",
    ) . "\n";
}
# aesni_generate8($dir)
#
# Appends the private 8-way interleaved subroutine _aesni_${dir}rypt8 to
# $code; $dir is "enc" or "dec". The emitted routine takes the key
# pointer in $key and the round count in $rounds, neither of which is
# preserved, and transforms $inout0..$inout7 in place. As in the 6-way
# variant the first round is interleaved with the round-0 key xor-s and
# the loop is entered in the middle.
sub aesni_generate8 {
    my $way  = shift;
    my $name = "_aesni_${way}rypt8";
    my $aes  = "aes$way";
    my @blk  = ($inout0, $inout1, $inout2, $inout3,
                $inout4, $inout5, $inout6, $inout7);

    # one "<insn> <round key>,<block>" line per listed block register
    my $each = sub { my ($insn, $rk, @regs) = @_; map { "$insn $rk,$_" } @regs };

    # first round of block i is paired with the whitening of block i+2
    my @prologue = $each->("xorps", $rndkey0, @blk[0, 1]);
    for my $i (0 .. 3) {
        push @prologue, $each->($aes,   $rndkey1, $blk[$i]),
                        $each->("pxor", $rndkey0, $blk[$i + 2]);
    }
    push @prologue,
        "dec $rounds",
        $each->($aes,   $rndkey1, $blk[4]),
        $each->("pxor", $rndkey0, $blk[6]),
        $each->($aes,   $rndkey1, $blk[5]),
        $each->("pxor", $rndkey0, $blk[7]),
        "$movkey ($key),$rndkey0",
        $each->($aes, $rndkey1, @blk[6, 7]),
        "$movkey 16($key),$rndkey1",
        "jmp .L${way}_loop8_enter";

    $code .= join("\n",
        ".type $name,\@abi-omnipotent",
        ".align 16",
        "${name}:",
        "$movkey ($key),$rndkey0",
        "shr \$1,$rounds",
        "$movkey 16($key),$rndkey1",
        "lea 32($key),$key",
        @prologue,
        ".align 16",
        ".L${way}_loop8:",
        $each->($aes, $rndkey1, @blk[0, 1]),
        "dec $rounds",
        $each->($aes, $rndkey1, @blk[2 .. 7]),
        "$movkey 16($key),$rndkey1",
        ".L${way}_loop8_enter: # happens to be 16-byte aligned",
        $each->($aes, $rndkey0, @blk[0, 1]),
        "lea 32($key),$key",
        $each->($aes, $rndkey0, @blk[2 .. 7]),
        "$movkey ($key),$rndkey0",
        "jnz .L${way}_loop8",
        $each->($aes, $rndkey1, @blk),
        $each->("${aes}last", $rndkey0, @blk),
        "ret",
        ".size $name,.-$name",
    ) . "\n";
}
# Emit the private interleaved helpers in order of interleave factor.
# The decrypt flavour is always generated; the encrypt flavour only when
# the script runs with the "aesni" prefix.
foreach my $emit (\&aesni_generate3, \&aesni_generate4,
                  \&aesni_generate6, \&aesni_generate8) {
    $emit->("enc") if ($PREFIX eq "aesni");
    $emit->("dec");
}
if ($PREFIX eq "aesni") {
########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
# size_t length, const AES_KEY *key,
# int enc);
#
# First part of the ECB template: argument handling, the encrypt loop
# processing eight blocks per iteration and the dispatch for an encrypt
# tail of one to seven blocks.
#  - the length is rounded down to a multiple of 16 and a zero result
#    returns immediately;
#  - $key and $rounds are copied to $key_ and $rnds_ and restored from
#    there, because the _aesni_[en|de]cryptN subroutines change them;
#  - a zero 5th argument (%r8d) selects the decrypt path;
#  - in the loop the stores of the previous eight results are paired
#    with the loads of the next eight inputs;
#  - the tail dispatch issues movups loads between a cmp and the je that
#    still consumes its flags, so the instruction order there must not
#    be changed;
#  - a seven-block tail goes through _aesni_encrypt8 and stores only
#    seven results.
$code.=<<___;
.globl aesni_ecb_encrypt
.type aesni_ecb_encrypt,\@function,5
.align 16
aesni_ecb_encrypt:
and \$-16,$len
jz .Lecb_ret
mov 240($key),$rounds # key->rounds
$movkey ($key),$rndkey0
mov $key,$key_ # backup $key
mov $rounds,$rnds_ # backup $rounds
test %r8d,%r8d # 5th argument
jz .Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
cmp \$0x80,$len
jb .Lecb_enc_tail
movdqu ($inp),$inout0
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
movdqu 0x40($inp),$inout4
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
lea 0x80($inp),$inp
sub \$0x80,$len
jmp .Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
movups $inout0,($out)
mov $key_,$key # restore $key
movdqu ($inp),$inout0
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
movups $inout2,0x20($out)
movdqu 0x20($inp),$inout2
movups $inout3,0x30($out)
movdqu 0x30($inp),$inout3
movups $inout4,0x40($out)
movdqu 0x40($inp),$inout4
movups $inout5,0x50($out)
movdqu 0x50($inp),$inout5
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
lea 0x80($out),$out
movdqu 0x70($inp),$inout7
lea 0x80($inp),$inp
.Lecb_enc_loop8_enter:
call _aesni_encrypt8
sub \$0x80,$len
jnc .Lecb_enc_loop8
movups $inout0,($out)
mov $key_,$key # restore $key
movups $inout1,0x10($out)
mov $rnds_,$rounds # restore $rounds
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
movups $inout7,0x70($out)
lea 0x80($out),$out
add \$0x80,$len
jz .Lecb_ret
.Lecb_enc_tail:
movups ($inp),$inout0
cmp \$0x20,$len
jb .Lecb_enc_one
movups 0x10($inp),$inout1
je .Lecb_enc_two
movups 0x20($inp),$inout2
cmp \$0x40,$len
jb .Lecb_enc_three
movups 0x30($inp),$inout3
je .Lecb_enc_four
movups 0x40($inp),$inout4
cmp \$0x60,$len
jb .Lecb_enc_five
movups 0x50($inp),$inout5
je .Lecb_enc_six
movdqu 0x60($inp),$inout6
call _aesni_encrypt8
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_one:
___
# .Lecb_enc_one: a single-block tail is encrypted by the inline sequence
# rather than by a call.
&aesni_generate1("enc",$key,$rounds);
# Second part of the ECB template: the store for the single-block
# encrypt tail, the two- to six-block encrypt tails, then the decrypt
# counterpart of the eight-block loop and of the tail dispatch.
#  - two- and five-block tails zero the one unused block register
#    ($inout2, $inout5) before calling the 3x and 6x subroutines;
#  - the decrypt loop reloads $rndkey0 from ($key_) after every
#    _aesni_decrypt8 call, and the seven-block decrypt tail loads it from
#    ($key) before the call;
#  - as on the encrypt side, the tail dispatch issues loads between a cmp
#    and the je that consumes its flags.
$code.=<<___;
movups $inout0,($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_two:
xorps $inout2,$inout2
call _aesni_encrypt3
movups $inout0,($out)
movups $inout1,0x10($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_three:
call _aesni_encrypt3
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_four:
call _aesni_encrypt4
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_five:
xorps $inout5,$inout5
call _aesni_encrypt6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
jmp .Lecb_ret
.align 16
.Lecb_enc_six:
call _aesni_encrypt6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
jmp .Lecb_ret
#--------------------------- ECB DECRYPT ------------------------------#
.align 16
.Lecb_decrypt:
cmp \$0x80,$len
jb .Lecb_dec_tail
movdqu ($inp),$inout0
movdqu 0x10($inp),$inout1
movdqu 0x20($inp),$inout2
movdqu 0x30($inp),$inout3
movdqu 0x40($inp),$inout4
movdqu 0x50($inp),$inout5
movdqu 0x60($inp),$inout6
movdqu 0x70($inp),$inout7
lea 0x80($inp),$inp
sub \$0x80,$len
jmp .Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
movups $inout0,($out)
mov $key_,$key # restore $key
movdqu ($inp),$inout0
mov $rnds_,$rounds # restore $rounds
movups $inout1,0x10($out)
movdqu 0x10($inp),$inout1
movups $inout2,0x20($out)
movdqu 0x20($inp),$inout2
movups $inout3,0x30($out)
movdqu 0x30($inp),$inout3
movups $inout4,0x40($out)
movdqu 0x40($inp),$inout4
movups $inout5,0x50($out)
movdqu 0x50($inp),$inout5
movups $inout6,0x60($out)
movdqu 0x60($inp),$inout6
movups $inout7,0x70($out)
lea 0x80($out),$out
movdqu 0x70($inp),$inout7
lea 0x80($inp),$inp
.Lecb_dec_loop8_enter:
call _aesni_decrypt8
$movkey ($key_),$rndkey0
sub \$0x80,$len
jnc .Lecb_dec_loop8
movups $inout0,($out)
mov $key_,$key # restore $key
movups $inout1,0x10($out)
mov $rnds_,$rounds # restore $rounds
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
movups $inout7,0x70($out)
lea 0x80($out),$out
add \$0x80,$len
jz .Lecb_ret
.Lecb_dec_tail:
movups ($inp),$inout0
cmp \$0x20,$len
jb .Lecb_dec_one
movups 0x10($inp),$inout1
je .Lecb_dec_two
movups 0x20($inp),$inout2
cmp \$0x40,$len
jb .Lecb_dec_three
movups 0x30($inp),$inout3
je .Lecb_dec_four
movups 0x40($inp),$inout4
cmp \$0x60,$len
jb .Lecb_dec_five
movups 0x50($inp),$inout5
je .Lecb_dec_six
movups 0x60($inp),$inout6
$movkey ($key),$rndkey0
call _aesni_decrypt8
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
movups $inout6,0x60($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_one:
___
# .Lecb_dec_one: a single-block tail is decrypted by the inline sequence
# rather than by a call.
&aesni_generate1("dec",$key,$rounds);
# Last part of the ECB template: the store for the single-block decrypt
# tail and the two- to six-block decrypt tails. They mirror the encrypt
# tails (the two- and five-block cases zero the unused block register
# before calling the 3x and 6x subroutines); the six-block case is placed
# last and falls through to the common .Lecb_ret exit.
$code.=<<___;
movups $inout0,($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_two:
xorps $inout2,$inout2
call _aesni_decrypt3
movups $inout0,($out)
movups $inout1,0x10($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_three:
call _aesni_decrypt3
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_four:
call _aesni_decrypt4
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_five:
xorps $inout5,$inout5
call _aesni_decrypt6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
jmp .Lecb_ret
.align 16
.Lecb_dec_six:
call _aesni_decrypt6
movups $inout0,($out)
movups $inout1,0x10($out)
movups $inout2,0x20($out)
movups $inout3,0x30($out)
movups $inout4,0x40($out)
movups $inout5,0x50($out)
.Lecb_ret:
ret
.size aesni_ecb_encrypt,.-aesni_ecb_encrypt
___
{
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
{
# Templates for aesni_ccm64_encrypt_blocks and aesni_ccm64_decrypt_blocks.
# Register roles inside both routines: $inout0 carries the counter block
# being encrypted, $inout1 the running CMAC value, $in0 the current
# input/output block and $iv the counter kept in shuffled form so that
# paddq with $increment can advance it.
my $cmac="%r9"; # 6th argument
my $increment="%xmm6";
my $bswap_mask="%xmm7";
$code.=<<___;
.globl aesni_ccm64_encrypt_blocks
.type aesni_ccm64_encrypt_blocks,\@function,6
.align 16
aesni_ccm64_encrypt_blocks:
___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9, which
# this routine uses as $increment, $bswap_mask, $in0 and $iv.
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
.Lccm64_enc_body:
___
# Encrypt body. The round count is halved once up front (and the halved
# value kept in $rnds_) because .Lccm64_enc2_loop applies two round keys
# per iteration. Each outer iteration encrypts the counter block and
# updates the CMAC side by side: the input block is xor-ed into the
# round-0 key first, so a single xorps folds both into $inout1. The
# next counter block is prepared from $iv while the last rounds complete.
# NOTE(review): .Lincrement64 and .Lbswap_mask are defined outside this
# block; presumably a 64-bit increment constant and a byte-reversal mask.
$code.=<<___;
mov 240($key),$rounds # key->rounds
movdqu ($ivp),$iv
movdqa .Lincrement64(%rip),$increment
movdqa .Lbswap_mask(%rip),$bswap_mask
shr \$1,$rounds
lea 0($key),$key_
movdqu ($cmac),$inout1
movdqa $iv,$inout0
mov $rounds,$rnds_
pshufb $bswap_mask,$iv
jmp .Lccm64_enc_outer
.align 16
.Lccm64_enc_outer:
$movkey ($key_),$rndkey0
mov $rnds_,$rounds
movups ($inp),$in0 # load inp
xorps $rndkey0,$inout0 # counter
$movkey 16($key_),$rndkey1
xorps $in0,$rndkey0
lea 32($key_),$key
xorps $rndkey0,$inout1 # cmac^=inp
$movkey ($key),$rndkey0
.Lccm64_enc2_loop:
aesenc $rndkey1,$inout0
dec $rounds
aesenc $rndkey1,$inout1
$movkey 16($key),$rndkey1
aesenc $rndkey0,$inout0
lea 32($key),$key
aesenc $rndkey0,$inout1
$movkey 0($key),$rndkey0
jnz .Lccm64_enc2_loop
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
paddq $increment,$iv
aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
dec $len
lea 16($inp),$inp
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
movups $in0,($out) # save output
lea 16($out),$out
pshufb $bswap_mask,$inout0
jnz .Lccm64_enc_outer
movups $inout1,($cmac)
___
# Win64 only: restore %xmm6-%xmm9 and release the stack area.
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
lea 0x58(%rsp),%rsp
.Lccm64_enc_ret:
___
$code.=<<___;
ret
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
___
######################################################################
$code.=<<___;
.globl aesni_ccm64_decrypt_blocks
.type aesni_ccm64_decrypt_blocks,\@function,6
.align 16
aesni_ccm64_decrypt_blocks:
___
# Win64 only: same stack area and %xmm6-%xmm9 save as on the encrypt side.
$code.=<<___ if ($win64);
lea -0x58(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
movaps %xmm9,0x30(%rsp)
.Lccm64_dec_body:
___
# Decrypt set-up. Unlike the encrypt routine, $rnds_ keeps the full round
# count here; it is halved inside the outer loop, after each restore.
$code.=<<___;
mov 240($key),$rounds # key->rounds
movups ($ivp),$iv
movdqu ($cmac),$inout1
movdqa .Lincrement64(%rip),$increment
movdqa .Lbswap_mask(%rip),$bswap_mask
movaps $iv,$inout0
mov $rounds,$rnds_
mov $key,$key_
pshufb $bswap_mask,$iv
___
# The first counter block is encrypted on its own, ahead of the loop.
&aesni_generate1("enc",$key,$rounds);
# Decrypt loop: the encrypted counter is xor-ed into the input to produce
# the output block, and that output is then folded into the CMAC while
# the next counter block is encrypted alongside it. After the last block
# the loop exits to .Lccm64_dec_break before that CMAC update.
$code.=<<___;
movups ($inp),$in0 # load inp
paddq $increment,$iv
lea 16($inp),$inp
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_outer:
xorps $inout0,$in0 # inp ^= E(iv)
movdqa $iv,$inout0
mov $rnds_,$rounds
movups $in0,($out) # save output
lea 16($out),$out
pshufb $bswap_mask,$inout0
sub \$1,$len
jz .Lccm64_dec_break
$movkey ($key_),$rndkey0
shr \$1,$rounds
$movkey 16($key_),$rndkey1
xorps $rndkey0,$in0
lea 32($key_),$key
xorps $rndkey0,$inout0
xorps $in0,$inout1 # cmac^=out
$movkey ($key),$rndkey0
.Lccm64_dec2_loop:
aesenc $rndkey1,$inout0
dec $rounds
aesenc $rndkey1,$inout1
$movkey 16($key),$rndkey1
aesenc $rndkey0,$inout0
lea 32($key),$key
aesenc $rndkey0,$inout1
$movkey 0($key),$rndkey0
jnz .Lccm64_dec2_loop
movups ($inp),$in0 # load inp
paddq $increment,$iv
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
lea 16($inp),$inp
aesenclast $rndkey0,$inout0
aesenclast $rndkey0,$inout1
jmp .Lccm64_dec_outer
.align 16
.Lccm64_dec_break:
#xorps $in0,$inout1 # cmac^=out
___
# Final CMAC update for the last output block: passing $in0 as the fifth
# argument makes the inline sequence xor round key 0 into $in0 and $in0
# into $inout1, which is why the explicit xorps above is commented out.
# $rounds holds the full count here (restored at the top of the loop).
&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
$code.=<<___;
movups $inout1,($cmac)
___
# Win64 only: restore %xmm6-%xmm9 and release the stack area.
$code.=<<___ if ($win64);
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
lea 0x58(%rsp),%rsp
.Lccm64_dec_ret:
___
$code.=<<___;
ret
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
___
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
# Keywords are full unroll and modulo-schedule counter calculations
# with zero-round key xor.
{
# Generator for aesni_ctr32_encrypt_blocks.
#
# Register/frame notes:
#   $in0..$in5   - %xmm10..%xmm15, hold input blocks (and, at the end of the
#                  8x loop, the next counter blocks read back from the stack).
#   $key0, $ctr  - $key_ and $ivp with a "d" suffix appended.  NOTE(review):
#                  $key_/$ivp are defined outside this chunk; the suffix is
#                  presumably the 32-bit form of an r8-r15 register.
#   $frame_size  - 0x80 bytes for eight 16-byte counter blocks kept at
#                  0x00..0x70(%rsp), plus 160 bytes on Win64 for the ten
#                  XMM registers saved below.
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
my ($key0,$ctr)=("${key_}d","${ivp}d");
my $frame_size = 0x80 + ($win64?160:0);
# Prologue: remember the incoming %rsp in %rax, save %rbp, carve out the
# frame and align %rsp to 16 bytes (the movdqa stores to the frame need it).
$code.=<<___;
.globl aesni_ctr32_encrypt_blocks
.type aesni_ctr32_encrypt_blocks,\@function,5
.align 16
aesni_ctr32_encrypt_blocks:
lea (%rsp),%rax
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6-%xmm15 below the incoming stack pointer.  The
# epilogue reloads them relative to %rbp, which is set to %rax-8 further
# down, hence the 8-byte difference between the two offset tables.
$code.=<<___ if ($win64);
movaps %xmm6,-0xa8(%rax)
movaps %xmm7,-0x98(%rax)
movaps %xmm8,-0x88(%rax)
movaps %xmm9,-0x78(%rax)
movaps %xmm10,-0x68(%rax)
movaps %xmm11,-0x58(%rax)
movaps %xmm12,-0x48(%rax)
movaps %xmm13,-0x38(%rax)
movaps %xmm14,-0x28(%rax)
movaps %xmm15,-0x18(%rax)
.Lctr32_body:
___
# Set-up.  A single block takes the .Lctr32_one_shortcut path.  Otherwise
# eight counter blocks are prepared, each already XORed with the round-0
# key: the IV is XORed as a whole, and each incremented counter word is
# byte-swapped and XORed with $key0 (the round-0 key word at offset 12)
# before being inserted.  Blocks 0-3 also stay in $inout0-$inout3, blocks
# 4-7 live only in the stack frame ($inout4/$inout5 are loaded from it).
# Fewer than 8 blocks go straight to .Lctr32_tail.  For the 8x loop $key is
# biased by 0x80 ("size optimization"), so round keys are addressed as
# N-0x80($key).
#
# .Lctr32_loop8, first round: all eight blocks get the first aesenc while
# the counter word for slot 0 of the *next* iteration is computed and stored.
$code.=<<___;
lea -8(%rax),%rbp
cmp \$1,$len
je .Lctr32_one_shortcut
movdqu ($ivp),$inout0
movdqu ($key),$rndkey0
mov 12($ivp),$ctr # counter LSB
pxor $rndkey0,$inout0
mov 12($key),$key0 # 0-round key LSB
movdqa $inout0,0x00(%rsp) # populate counter block
bswap $ctr
movdqa $inout0,$inout1
movdqa $inout0,$inout2
movdqa $inout0,$inout3
movdqa $inout0,0x40(%rsp)
movdqa $inout0,0x50(%rsp)
movdqa $inout0,0x60(%rsp)
movdqa $inout0,0x70(%rsp)
mov 240($key),$rounds # key->rounds
lea 1($ctr),%r9
lea 2($ctr),%r10
bswap %r9d
bswap %r10d
xor $key0,%r9d
xor $key0,%r10d
pinsrd \$3,%r9d,$inout1
lea 3($ctr),%r9
movdqa $inout1,0x10(%rsp)
pinsrd \$3,%r10d,$inout2
bswap %r9d
lea 4($ctr),%r10
movdqa $inout2,0x20(%rsp)
xor $key0,%r9d
bswap %r10d
pinsrd \$3,%r9d,$inout3
xor $key0,%r10d
movdqa $inout3,0x30(%rsp)
lea 5($ctr),%r9
mov %r10d,0x40+12(%rsp)
bswap %r9d
lea 6($ctr),%r10
xor $key0,%r9d
bswap %r10d
mov %r9d,0x50+12(%rsp)
xor $key0,%r10d
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
xor $key0,%r9d
mov %r9d,0x70+12(%rsp)
$movkey 0x10($key),$rndkey1
movdqa 0x40(%rsp),$inout4
movdqa 0x50(%rsp),$inout5
cmp \$8,$len
jb .Lctr32_tail
lea 0x80($key),$key # size optimization
sub \$8,$len
jmp .Lctr32_loop8
.align 32
.Lctr32_loop8:
add \$8,$ctr
movdqa 0x60(%rsp),$inout6
aesenc $rndkey1,$inout0
mov $ctr,%r9d
movdqa 0x70(%rsp),$inout7
aesenc $rndkey1,$inout1
bswap %r9d
$movkey 0x20-0x80($key),$rndkey0
aesenc $rndkey1,$inout2
xor $key0,%r9d
aesenc $rndkey1,$inout3
mov %r9d,0x00+12(%rsp)
lea 1($ctr),%r9
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
$movkey 0x30-0x80($key),$rndkey1
___
# Generation-time unrolling of six more rounds.  Each generated step applies
# one round key to all eight blocks (alternating $rndkey0/$rndkey1), stores
# the finished counter word for stack slot $i-1, starts the word for slot $i
# and preloads the round key two entries ahead.
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
aesenc $rndkeyx,$inout0
aesenc $rndkeyx,$inout1
bswap %r9d
aesenc $rndkeyx,$inout2
xor $key0,%r9d
aesenc $rndkeyx,$inout3
mov %r9d,`0x10*($i-1)`+12(%rsp)
lea $i($ctr),%r9
aesenc $rndkeyx,$inout4
aesenc $rndkeyx,$inout5
aesenc $rndkeyx,$inout6
aesenc $rndkeyx,$inout7
$movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
___
}
# Rest of the 8x loop plus all tail paths:
#  - One more round finishes the counter word for slot 7.  Then
#    "cmp 11,$rounds" selects how many extra rounds to run: below 11 none,
#    equal to 11 two, otherwise four.  NOTE(review): presumably key->rounds
#    is 9/11/13 for 128/192/256-bit keys -- confirm against the key set-up
#    routine.
#  - .Lctr32_enc_done: the eight input blocks are XORed with the last round
#    key ($rndkey0) and passed as the aesenclast operand, so the final round
#    and the "keystream ^ input" step are one instruction.  The next
#    iteration's counter blocks are read back from the stack meanwhile.
#  - After the loop, $key is un-biased again and 1..7 leftover blocks are
#    handled in .Lctr32_tail: fewer than 4 -> .Lctr32_loop3, exactly 4 ->
#    .Lctr32_loop4, otherwise the 8-wide helper is entered with $inout7
#    zeroed.  NOTE(review): .Lenc_loop8_enter lives outside this chunk.
#  - .Lctr32_one_shortcut: single-block case, encrypts the IV directly.
$code.=<<___;
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
bswap %r9d
aesenc $rndkey0,$inout2
xor $key0,%r9d
aesenc $rndkey0,$inout3
mov %r9d,0x70+12(%rsp)
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
aesenc $rndkey0,$inout6
movdqu 0x00($inp),$in0
aesenc $rndkey0,$inout7
$movkey 0xa0-0x80($key),$rndkey0
cmp \$11,$rounds
jb .Lctr32_enc_done
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
$movkey 0xb0-0x80($key),$rndkey1
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
aesenc $rndkey0,$inout6
aesenc $rndkey0,$inout7
$movkey 0xc0-0x80($key),$rndkey0
je .Lctr32_enc_done
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
$movkey 0xd0-0x80($key),$rndkey1
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
aesenc $rndkey0,$inout6
aesenc $rndkey0,$inout7
$movkey 0xe0-0x80($key),$rndkey0
.Lctr32_enc_done:
movdqu 0x10($inp),$in1
pxor $rndkey0,$in0
movdqu 0x20($inp),$in2
pxor $rndkey0,$in1
movdqu 0x30($inp),$in3
pxor $rndkey0,$in2
movdqu 0x40($inp),$in4
pxor $rndkey0,$in3
movdqu 0x50($inp),$in5
pxor $rndkey0,$in4
aesenc $rndkey1,$inout0
pxor $rndkey0,$in5
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
movdqu 0x60($inp),$rndkey1
aesenclast $in0,$inout0
pxor $rndkey0,$rndkey1
movdqu 0x70($inp),$in0
lea 0x80($inp),$inp
aesenclast $in1,$inout1
pxor $rndkey0,$in0
movdqa 0x00(%rsp),$in1 # load next counter block
aesenclast $in2,$inout2
movdqa 0x10(%rsp),$in2
aesenclast $in3,$inout3
movdqa 0x20(%rsp),$in3
aesenclast $in4,$inout4
movdqa 0x30(%rsp),$in4
aesenclast $in5,$inout5
movdqa 0x40(%rsp),$in5
aesenclast $rndkey1,$inout6
movdqa 0x50(%rsp),$rndkey0
aesenclast $in0,$inout7
$movkey 0x10-0x80($key),$rndkey1
movups $inout0,($out) # store output
movdqa $in1,$inout0
movups $inout1,0x10($out)
movdqa $in2,$inout1
movups $inout2,0x20($out)
movdqa $in3,$inout2
movups $inout3,0x30($out)
movdqa $in4,$inout3
movups $inout4,0x40($out)
movdqa $in5,$inout4
movups $inout5,0x50($out)
movdqa $rndkey0,$inout5
movups $inout6,0x60($out)
movups $inout7,0x70($out)
lea 0x80($out),$out
sub \$8,$len
jnc .Lctr32_loop8
add \$8,$len
jz .Lctr32_done
lea -0x80($key),$key
.Lctr32_tail:
lea 16($key),$key
cmp \$4,$len
jb .Lctr32_loop3
je .Lctr32_loop4
movdqa 0x60(%rsp),$inout6
pxor $inout7,$inout7
$movkey 16($key),$rndkey0
aesenc $rndkey1,$inout0
lea 16($key),$key
aesenc $rndkey1,$inout1
shr \$1,$rounds
aesenc $rndkey1,$inout2
dec $rounds
aesenc $rndkey1,$inout3
movups ($inp),$in0
aesenc $rndkey1,$inout4
movups 0x10($inp),$in1
aesenc $rndkey1,$inout5
movups 0x20($inp),$in2
aesenc $rndkey1,$inout6
$movkey 16($key),$rndkey1
call .Lenc_loop8_enter
movdqu 0x30($inp),$in3
pxor $in0,$inout0
movdqu 0x40($inp),$in0
pxor $in1,$inout1
movdqu $inout0,($out)
pxor $in2,$inout2
movdqu $inout1,0x10($out)
pxor $in3,$inout3
movdqu $inout2,0x20($out)
pxor $in0,$inout4
movdqu $inout3,0x30($out)
movdqu $inout4,0x40($out)
cmp \$6,$len
jb .Lctr32_done
movups 0x50($inp),$in1
xorps $in1,$inout5
movups $inout5,0x50($out)
je .Lctr32_done
movups 0x60($inp),$in2
xorps $in2,$inout6
movups $inout6,0x60($out)
jmp .Lctr32_done
.align 32
.Lctr32_loop4:
aesenc $rndkey1,$inout0
lea 16($key),$key
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
$movkey ($key),$rndkey1
dec $rounds
jnz .Lctr32_loop4
aesenclast $rndkey1,$inout0
movups ($inp),$in0
aesenclast $rndkey1,$inout1
movups 0x10($inp),$in1
aesenclast $rndkey1,$inout2
movups 0x20($inp),$in2
aesenclast $rndkey1,$inout3
movups 0x30($inp),$in3
xorps $in0,$inout0
movups $inout0,($out)
xorps $in1,$inout1
movups $inout1,0x10($out)
pxor $in2,$inout2
movdqu $inout2,0x20($out)
pxor $in3,$inout3
movdqu $inout3,0x30($out)
jmp .Lctr32_done
.align 32
.Lctr32_loop3:
aesenc $rndkey1,$inout0
lea 16($key),$key
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
$movkey ($key),$rndkey1
dec $rounds
jnz .Lctr32_loop3
aesenclast $rndkey1,$inout0
aesenclast $rndkey1,$inout1
aesenclast $rndkey1,$inout2
movups ($inp),$in0
xorps $in0,$inout0
movups $inout0,($out)
cmp \$2,$len
jb .Lctr32_done
movups 0x10($inp),$in1
xorps $in1,$inout1
movups $inout1,0x10($out)
je .Lctr32_done
movups 0x20($inp),$in2
xorps $in2,$inout2
movups $inout2,0x20($out)
jmp .Lctr32_done
.align 16
.Lctr32_one_shortcut:
movups ($ivp),$inout0
movups ($inp),$in0
mov 240($key),$rounds # key->rounds
___
# Single-block path: encrypt the counter block.  NOTE(review):
# aesni_generate1 is defined outside this chunk; the xorps below relies on
# the result being left in $inout0.
&aesni_generate1("enc",$key,$rounds);
# XOR the keystream block with the input and store it.
$code.=<<___;
xorps $in0,$inout0
movups $inout0,($out)
jmp .Lctr32_done
.align 16
.Lctr32_done:
___
# Win64 only: reload %xmm6-%xmm15 (offsets are relative to %rbp = entry
# %rsp - 8, see the prologue).
$code.=<<___ if ($win64);
movaps -0xa0(%rbp),%xmm6
movaps -0x90(%rbp),%xmm7
movaps -0x80(%rbp),%xmm8
movaps -0x70(%rbp),%xmm9
movaps -0x60(%rbp),%xmm10
movaps -0x50(%rbp),%xmm11
movaps -0x40(%rbp),%xmm12
movaps -0x30(%rbp),%xmm13
movaps -0x20(%rbp),%xmm14
movaps -0x10(%rbp),%xmm15
___
# Epilogue: drop the aligned frame via %rbp and restore the caller's %rbp.
$code.=<<___;
lea (%rbp),%rsp
pop %rbp
.Lctr32_epilogue:
ret
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
___
}
######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
{
# Register and frame layout shared by aesni_xts_encrypt/aesni_xts_decrypt.
#   @tweak[0..5] - %xmm10..%xmm15, six consecutive tweak values.
#   $twmask      - %xmm8, mask used when advancing the tweak.
#   $twres       - %xmm9, scratch derived from the tweak's upper bits.
#   $twtmp       - alias of @tweak[4] (%xmm14), NOT a separate register.
#   $key2        - %r8.
#   $ivp, $len_  - both %r9: the IV is read through $ivp before the
#                  original length is backed up into $len_.
#   $frame_size  - 0x70 bytes: six 16-byte slots at 0x00..0x50(%rsp) for
#                  tweaks and one at 0x60(%rsp) for round[0]^round[last];
#                  plus 160 bytes on Win64 for ten saved XMM registers.
my @tweak=map("%xmm$_",(10..15));
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
my $frame_size = 0x70 + ($win64?160:0);
# Emit aesni_xts_encrypt (declared below as a 6-argument function).
# Outline of the generated code:
#   1. the clear-text tweak is loaded from ($ivp) and encrypted with key2;
#   2. six consecutive tweaks are derived, five of them pre-XORed with
#      key1's round-0 key;
#   3. .Lxts_enc_grandloop processes six blocks per iteration and computes
#      the next six tweaks while the AES rounds are in flight;
#   4. .Lxts_enc_short handles 1-5 leftover whole blocks;
#   5. if the length is not a multiple of 16, .Lxts_enc_steal swaps the
#      trailing bytes with the last full output block and that block is
#      encrypted once more (ciphertext stealing).
# Prologue: remember the incoming %rsp in %rax, save %rbp, carve out the
# frame and align %rsp to 16 bytes (the movdqa/movaps frame accesses need it).
$code.=<<___;
.globl aesni_xts_encrypt
.type aesni_xts_encrypt,\@function,6
.align 16
aesni_xts_encrypt:
lea (%rsp),%rax
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6-%xmm15 below the incoming stack pointer.  The
# epilogue reloads them relative to %rbp (= %rax-8), hence the 8-byte
# difference between the two offset tables.
$code.=<<___ if ($win64);
movaps %xmm6,-0xa8(%rax)
movaps %xmm7,-0x98(%rax)
movaps %xmm8,-0x88(%rax)
movaps %xmm9,-0x78(%rax)
movaps %xmm10,-0x68(%rax)
movaps %xmm11,-0x58(%rax)
movaps %xmm12,-0x48(%rax)
movaps %xmm13,-0x38(%rax)
movaps %xmm14,-0x28(%rax)
movaps %xmm15,-0x18(%rax)
.Lxts_enc_body:
___
# Load the tweak and both round counts.  The key2 rounds are read through
# $key2 (not a hard-coded %r8) so that this load always uses the same
# register as the aesni_generate1 call below and as aesni_xts_decrypt.
$code.=<<___;
lea -8(%rax),%rbp
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
# generate the tweak
&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
# Back up $key, $rounds and $len, round $len down to whole blocks and
# compute round[0]^round[last] of key1 in $rndkey1 (parked at 0x60(%rsp)
# further down).  $twres receives the tweak's upper dwords for the carry
# computation.  NOTE(review): .Lxts_magic is defined outside this chunk.
$code.=<<___;
$movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
$movkey 16($key,$rnds_),$rndkey1 # last round key
mov $rounds,$rnds_
movdqa .Lxts_magic(%rip),$twmask
pshufd \$0x5f,@tweak[5],$twres
pxor $rndkey0,$rndkey1
___
# alternative tweak calculation algorithm is based on suggestions
# by Shay Gueron. psrad doesn't conflict with AES-NI instructions
# and should help in the future...
#
# Each generated step copies the current tweak to @tweak[$i] (XORed with the
# round-0 key) and advances @tweak[5] by doubling it and XORing in the
# $twmask-masked broadcast of its upper bits.
for ($i=0;$i<4;$i++) {
$code.=<<___;
movdqa $twres,$twtmp
paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
psrad \$31,$twtmp # broadcast upper bits
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
pxor $rndkey0,@tweak[$i]
pxor $twtmp,@tweak[5]
___
}
# Main body.
#  - Fifth tweak is produced like the four above; @tweak[5] keeps the sixth
#    one without the round-0 key.
#  - Fewer than six blocks: jump to .Lxts_enc_short.  Otherwise $rounds
#    becomes rounds/2-3, the .Lxts_enc_loop6 iteration count.
#  - .Lxts_enc_grandloop: inputs are XORed with tweak^round[0] and the
#    first two rounds are issued while loading.  Each tweak is then XORed
#    with round[0]^round[last] and parked on the stack, so aesenclast can
#    take "tweak^last round key" straight from memory.
#  - .Lxts_enc_loop6 runs two rounds per iteration; the remaining rounds are
#    interleaved with the computation of the next six tweaks.
#  - After the loop the original $rounds/$key are restored.
#  - .Lxts_enc_short: the round-0 key is stripped from the tweaks again
#    (presumably because _aesni_encrypt3/4/6, defined outside this chunk,
#    apply it themselves -- TODO confirm).  1..5 blocks are dispatched to
#    dedicated paths; every path leaves the next unused tweak in @tweak[0]
#    for the stealing step.
$code.=<<___;
movdqa @tweak[5],@tweak[4]
psrad \$31,$twres
paddq @tweak[5],@tweak[5]
pand $twmask,$twres
pxor $rndkey0,@tweak[4]
pxor $twres,@tweak[5]
movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
sub \$16*6,$len
jc .Lxts_enc_short
shr \$1,$rounds
sub \$3,$rounds
$movkey 16($key_),$rndkey1
mov $rounds,$rnds_
lea .Lxts_magic(%rip),%r8
jmp .Lxts_enc_grandloop
.align 32
.Lxts_enc_grandloop:
movdqu `16*0`($inp),$inout0 # load input
movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
pxor @tweak[0],$inout0
movdqu `16*2`($inp),$inout2
pxor @tweak[1],$inout1
aesenc $rndkey1,$inout0
movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
aesenc $rndkey1,$inout1
movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
aesenc $rndkey1,$inout2
movdqu `16*5`($inp),$inout5
pxor @tweak[5],$twmask # round[0]^=tweak[5]
movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
aesenc $rndkey1,$inout3
$movkey 32($key_),$rndkey0
lea `16*6`($inp),$inp
pxor $twmask,$inout5
pxor $twres,@tweak[0]
aesenc $rndkey1,$inout4
pxor $twres,@tweak[1]
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesenc $rndkey1,$inout5
$movkey 48($key_),$rndkey1
aesenc $rndkey0,$inout0
pxor $twres,@tweak[2]
movdqa @tweak[1],`16*1`(%rsp)
aesenc $rndkey0,$inout1
pxor $twres,@tweak[3]
movdqa @tweak[2],`16*2`(%rsp)
aesenc $rndkey0,$inout2
pxor $twres,@tweak[4]
aesenc $rndkey0,$inout3
pxor $twres,$twmask
movdqa @tweak[4],`16*4`(%rsp)
aesenc $rndkey0,$inout4
movdqa $twmask,`16*5`(%rsp)
aesenc $rndkey0,$inout5
$movkey 64($key_),$rndkey0
lea 64($key_),$key
pshufd \$0x5f,@tweak[5],$twres
jmp .Lxts_enc_loop6
.align 32
.Lxts_enc_loop6:
aesenc $rndkey1,$inout0
aesenc $rndkey1,$inout1
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
$movkey 16($key),$rndkey1
lea 32($key),$key
aesenc $rndkey0,$inout0
aesenc $rndkey0,$inout1
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
jnz .Lxts_enc_loop6
movdqa (%r8),$twmask
movdqa $twres,$twtmp
paddd $twres,$twres
aesenc $rndkey1,$inout0
paddq @tweak[5],@tweak[5]
psrad \$31,$twtmp
aesenc $rndkey1,$inout1
pand $twmask,$twtmp
$movkey ($key_),@tweak[0] # load round[0]
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
movaps @tweak[0],@tweak[1] # copy round[0]
aesenc $rndkey1,$inout5
$movkey 16($key),$rndkey1
movdqa $twres,$twtmp
paddd $twres,$twres
aesenc $rndkey0,$inout0
pxor @tweak[5],@tweak[0]
psrad \$31,$twtmp
aesenc $rndkey0,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
pxor $twtmp,@tweak[5]
aesenc $rndkey0,$inout4
movaps @tweak[1],@tweak[2]
aesenc $rndkey0,$inout5
$movkey 32($key),$rndkey0
movdqa $twres,$twtmp
paddd $twres,$twres
aesenc $rndkey1,$inout0
pxor @tweak[5],@tweak[1]
psrad \$31,$twtmp
aesenc $rndkey1,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesenc $rndkey1,$inout2
movdqa @tweak[3],`16*3`(%rsp)
aesenc $rndkey1,$inout3
pxor $twtmp,@tweak[5]
aesenc $rndkey1,$inout4
movaps @tweak[2],@tweak[3]
aesenc $rndkey1,$inout5
$movkey 48($key),$rndkey1
movdqa $twres,$twtmp
paddd $twres,$twres
aesenc $rndkey0,$inout0
pxor @tweak[5],@tweak[2]
psrad \$31,$twtmp
aesenc $rndkey0,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesenc $rndkey0,$inout2
aesenc $rndkey0,$inout3
pxor $twtmp,@tweak[5]
aesenc $rndkey0,$inout4
movaps @tweak[3],@tweak[4]
aesenc $rndkey0,$inout5
movdqa $twres,$rndkey0
paddd $twres,$twres
aesenc $rndkey1,$inout0
pxor @tweak[5],@tweak[3]
psrad \$31,$rndkey0
aesenc $rndkey1,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$rndkey0
aesenc $rndkey1,$inout2
aesenc $rndkey1,$inout3
pxor $rndkey0,@tweak[5]
$movkey ($key_),$rndkey0
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
$movkey 16($key_),$rndkey1
pxor @tweak[5],@tweak[4]
psrad \$31,$twres
aesenclast `16*0`(%rsp),$inout0
paddq @tweak[5],@tweak[5]
pand $twmask,$twres
aesenclast `16*1`(%rsp),$inout1
aesenclast `16*2`(%rsp),$inout2
pxor $twres,@tweak[5]
aesenclast `16*3`(%rsp),$inout3
aesenclast `16*4`(%rsp),$inout4
aesenclast `16*5`(%rsp),$inout5
mov $rnds_,$rounds # restore $rounds
lea `16*6`($out),$out
movups $inout0,`-16*6`($out) # write output
movups $inout1,`-16*5`($out)
movups $inout2,`-16*4`($out)
movups $inout3,`-16*3`($out)
movups $inout4,`-16*2`($out)
movups $inout5,`-16*1`($out)
sub \$16*6,$len
jnc .Lxts_enc_grandloop
lea 7($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_enc_short:
pxor $rndkey0,@tweak[0]
add \$16*6,$len
jz .Lxts_enc_done
pxor $rndkey0,@tweak[1]
cmp \$0x20,$len
jb .Lxts_enc_one
pxor $rndkey0,@tweak[2]
je .Lxts_enc_two
pxor $rndkey0,@tweak[3]
cmp \$0x40,$len
jb .Lxts_enc_three
pxor $rndkey0,@tweak[4]
je .Lxts_enc_four
movdqu ($inp),$inout0
movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
pxor @tweak[1],$inout1
movdqu 16*4($inp),$inout4
lea 16*5($inp),$inp
pxor @tweak[2],$inout2
pxor @tweak[3],$inout3
pxor @tweak[4],$inout4
call _aesni_encrypt6
xorps @tweak[0],$inout0
movdqa @tweak[5],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
movdqu $inout0,($out)
xorps @tweak[3],$inout3
movdqu $inout1,16*1($out)
xorps @tweak[4],$inout4
movdqu $inout2,16*2($out)
movdqu $inout3,16*3($out)
movdqu $inout4,16*4($out)
lea 16*5($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_one:
movups ($inp),$inout0
lea 16*1($inp),$inp
xorps @tweak[0],$inout0
___
# One leftover block: encrypt $inout0 (tweak applied before and after).
&aesni_generate1("enc",$key,$rounds);
# Remaining short paths (two/three/four blocks), then .Lxts_enc_done:
# if the original length ($len_) has a remainder modulo 16, the byte-wise
# .Lxts_enc_steal loop moves the trailing input bytes into the last full
# output block and that block's displaced bytes to the tail.  %eax/%ecx are
# borrowed for the byte moves, so $rounds and $key are restored afterwards.
$code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[1],@tweak[0]
movups $inout0,($out)
lea 16*1($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_two:
movups ($inp),$inout0
movups 16($inp),$inout1
lea 32($inp),$inp
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
call _aesni_encrypt3
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
xorps @tweak[1],$inout1
movups $inout0,($out)
movups $inout1,16*1($out)
lea 16*2($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_three:
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
lea 16*3($inp),$inp
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
call _aesni_encrypt3
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
movups $inout0,($out)
movups $inout1,16*1($out)
movups $inout2,16*2($out)
lea 16*3($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_four:
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
lea 16*4($inp),$inp
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[3],$inout3
call _aesni_encrypt4
pxor @tweak[0],$inout0
movdqa @tweak[4],@tweak[0]
pxor @tweak[1],$inout1
pxor @tweak[2],$inout2
movdqu $inout0,($out)
pxor @tweak[3],$inout3
movdqu $inout1,16*1($out)
movdqu $inout2,16*2($out)
movdqu $inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_enc_done
.align 16
.Lxts_enc_done:
and \$15,$len_
jz .Lxts_enc_ret
mov $len_,$len
.Lxts_enc_steal:
movzb ($inp),%eax # borrow $rounds ...
movzb -16($out),%ecx # ... and $key
lea 1($inp),$inp
mov %al,-16($out)
mov %cl,0($out)
lea 1($out),$out
sub \$1,$len
jnz .Lxts_enc_steal
sub $len_,$out # rewind $out
mov $key_,$key # restore $key
mov $rnds_,$rounds # restore $rounds
movups -16($out),$inout0
xorps @tweak[0],$inout0
___
# Encrypt the patched last full block with the next tweak (@tweak[0]).
&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movups $inout0,-16($out)
.Lxts_enc_ret:
___
# Win64 only: reload %xmm6-%xmm15 (offsets relative to %rbp = entry %rsp-8).
$code.=<<___ if ($win64);
movaps -0xa0(%rbp),%xmm6
movaps -0x90(%rbp),%xmm7
movaps -0x80(%rbp),%xmm8
movaps -0x70(%rbp),%xmm9
movaps -0x60(%rbp),%xmm10
movaps -0x50(%rbp),%xmm11
movaps -0x40(%rbp),%xmm12
movaps -0x30(%rbp),%xmm13
movaps -0x20(%rbp),%xmm14
movaps -0x10(%rbp),%xmm15
___
# Epilogue: drop the aligned frame via %rbp and restore the caller's %rbp.
$code.=<<___;
lea (%rbp),%rsp
pop %rbp
.Lxts_enc_epilogue:
ret
.size aesni_xts_encrypt,.-aesni_xts_encrypt
___
# Emit aesni_xts_decrypt (declared below as a 6-argument function).
# The structure mirrors the encrypt routine: the tweak is encrypted with
# key2, six tweaks are derived, .Lxts_dec_grandloop handles six blocks per
# iteration and .Lxts_dec_short handles 1-5 leftover blocks.
# The difference is the tail.  When the length is not a multiple of 16, one
# whole block is held back (len-=16 below) and handled in .Lxts_dec_done2
# together with the trailing bytes, using the two next tweaks in swapped
# order (@tweak[1] first, then @tweak[0]).
# Prologue: remember the incoming %rsp in %rax, save %rbp, carve out the
# frame and align %rsp to 16 bytes.
$code.=<<___;
.globl aesni_xts_decrypt
.type aesni_xts_decrypt,\@function,6
.align 16
aesni_xts_decrypt:
lea (%rsp),%rax
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6-%xmm15 below the incoming stack pointer.  The
# epilogue reloads them relative to %rbp (= %rax-8), hence the 8-byte
# difference between the two offset tables.
$code.=<<___ if ($win64);
movaps %xmm6,-0xa8(%rax)
movaps %xmm7,-0x98(%rax)
movaps %xmm8,-0x88(%rax)
movaps %xmm9,-0x78(%rax)
movaps %xmm10,-0x68(%rax)
movaps %xmm11,-0x58(%rax)
movaps %xmm12,-0x48(%rax)
movaps %xmm13,-0x38(%rax)
movaps %xmm14,-0x28(%rax)
movaps %xmm15,-0x18(%rax)
.Lxts_dec_body:
___
# Load the clear-text tweak and both round counts.  %rbp is derived from
# %rax first because $rounds is loaded right after.
$code.=<<___;
lea -8(%rax),%rbp
movups ($ivp),@tweak[5] # load clear-text tweak
mov 240($key2),$rounds # key2->rounds
mov 240($key),$rnds_ # key1->rounds
___
# generate the tweak
&aesni_generate1("enc",$key2,$rounds,@tweak[5]);
# Hold back one block if the length has a remainder, then back up $key,
# $rounds and $len, round $len down to whole blocks and compute
# round[0]^round[last] of key1 in $rndkey1 (parked at 0x60(%rsp) further
# down).  NOTE(review): .Lxts_magic is defined outside this chunk.
$code.=<<___;
xor %eax,%eax # if ($len%16) len-=16;
test \$15,$len
setnz %al
shl \$4,%rax
sub %rax,$len
$movkey ($key),$rndkey0 # zero round key
mov $key,$key_ # backup $key
mov $rnds_,$rounds # backup $rounds
shl \$4,$rnds_
mov $len,$len_ # backup $len
and \$-16,$len
$movkey 16($key,$rnds_),$rndkey1 # last round key
mov $rounds,$rnds_
movdqa .Lxts_magic(%rip),$twmask
pshufd \$0x5f,@tweak[5],$twres
pxor $rndkey0,$rndkey1
___
# Each generated step copies the current tweak to @tweak[$i] (XORed with the
# round-0 key) and advances @tweak[5] by doubling it and XORing in the
# $twmask-masked broadcast of its upper bits.
for ($i=0;$i<4;$i++) {
$code.=<<___;
movdqa $twres,$twtmp
paddd $twres,$twres
movdqa @tweak[5],@tweak[$i]
psrad \$31,$twtmp # broadcast upper bits
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
pxor $rndkey0,@tweak[$i]
pxor $twtmp,@tweak[5]
___
}
# Main body.
#  - Fifth tweak is produced like the four above; @tweak[5] keeps the sixth
#    one without the round-0 key.
#  - Fewer than six blocks: jump to .Lxts_dec_short.  Otherwise $rounds
#    becomes rounds/2-3, the .Lxts_dec_loop6 iteration count.
#  - .Lxts_dec_grandloop: inputs are XORed with tweak^round[0] and the
#    first two rounds are issued while loading.  Each tweak is then XORed
#    with round[0]^round[last] and parked on the stack, so aesdeclast can
#    take "tweak^last round key" straight from memory.
#  - .Lxts_dec_loop6 runs two rounds per iteration; the remaining rounds are
#    interleaved with the computation of the next six tweaks.
#  - .Lxts_dec_short: the round-0 key is stripped from the tweaks again
#    (presumably because _aesni_decrypt3/4/6, defined outside this chunk,
#    apply it themselves -- TODO confirm).
#  - Five-block path: only six tweaks exist, so the seventh one needed for
#    the stolen block is computed on the spot.  @tweak[0] gets the sixth
#    tweak, @tweak[1] the doubled-and-reduced one, and control goes to
#    .Lxts_dec_done2.
$code.=<<___;
movdqa @tweak[5],@tweak[4]
psrad \$31,$twres
paddq @tweak[5],@tweak[5]
pand $twmask,$twres
pxor $rndkey0,@tweak[4]
pxor $twres,@tweak[5]
movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
sub \$16*6,$len
jc .Lxts_dec_short
shr \$1,$rounds
sub \$3,$rounds
$movkey 16($key_),$rndkey1
mov $rounds,$rnds_
lea .Lxts_magic(%rip),%r8
jmp .Lxts_dec_grandloop
.align 32
.Lxts_dec_grandloop:
movdqu `16*0`($inp),$inout0 # load input
movdqa $rndkey0,$twmask
movdqu `16*1`($inp),$inout1
pxor @tweak[0],$inout0
movdqu `16*2`($inp),$inout2
pxor @tweak[1],$inout1
aesdec $rndkey1,$inout0
movdqu `16*3`($inp),$inout3
pxor @tweak[2],$inout2
aesdec $rndkey1,$inout1
movdqu `16*4`($inp),$inout4
pxor @tweak[3],$inout3
aesdec $rndkey1,$inout2
movdqu `16*5`($inp),$inout5
pxor @tweak[5],$twmask # round[0]^=tweak[5]
movdqa 0x60(%rsp),$twres # load round[0]^round[last]
pxor @tweak[4],$inout4
aesdec $rndkey1,$inout3
$movkey 32($key_),$rndkey0
lea `16*6`($inp),$inp
pxor $twmask,$inout5
pxor $twres,@tweak[0]
aesdec $rndkey1,$inout4
pxor $twres,@tweak[1]
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
aesdec $rndkey1,$inout5
$movkey 48($key_),$rndkey1
aesdec $rndkey0,$inout0
pxor $twres,@tweak[2]
movdqa @tweak[1],`16*1`(%rsp)
aesdec $rndkey0,$inout1
pxor $twres,@tweak[3]
movdqa @tweak[2],`16*2`(%rsp)
aesdec $rndkey0,$inout2
pxor $twres,@tweak[4]
aesdec $rndkey0,$inout3
pxor $twres,$twmask
movdqa @tweak[4],`16*4`(%rsp)
aesdec $rndkey0,$inout4
movdqa $twmask,`16*5`(%rsp)
aesdec $rndkey0,$inout5
$movkey 64($key_),$rndkey0
lea 64($key_),$key
pshufd \$0x5f,@tweak[5],$twres
jmp .Lxts_dec_loop6
.align 32
.Lxts_dec_loop6:
aesdec $rndkey1,$inout0
aesdec $rndkey1,$inout1
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
$movkey 16($key),$rndkey1
lea 32($key),$key
aesdec $rndkey0,$inout0
aesdec $rndkey0,$inout1
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
aesdec $rndkey0,$inout4
aesdec $rndkey0,$inout5
$movkey ($key),$rndkey0
dec $rounds
jnz .Lxts_dec_loop6
movdqa (%r8),$twmask
movdqa $twres,$twtmp
paddd $twres,$twres
aesdec $rndkey1,$inout0
paddq @tweak[5],@tweak[5]
psrad \$31,$twtmp
aesdec $rndkey1,$inout1
pand $twmask,$twtmp
$movkey ($key_),@tweak[0] # load round[0]
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
movaps @tweak[0],@tweak[1] # copy round[0]
aesdec $rndkey1,$inout5
$movkey 16($key),$rndkey1
movdqa $twres,$twtmp
paddd $twres,$twres
aesdec $rndkey0,$inout0
pxor @tweak[5],@tweak[0]
psrad \$31,$twtmp
aesdec $rndkey0,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
pxor $twtmp,@tweak[5]
aesdec $rndkey0,$inout4
movaps @tweak[1],@tweak[2]
aesdec $rndkey0,$inout5
$movkey 32($key),$rndkey0
movdqa $twres,$twtmp
paddd $twres,$twres
aesdec $rndkey1,$inout0
pxor @tweak[5],@tweak[1]
psrad \$31,$twtmp
aesdec $rndkey1,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesdec $rndkey1,$inout2
movdqa @tweak[3],`16*3`(%rsp)
aesdec $rndkey1,$inout3
pxor $twtmp,@tweak[5]
aesdec $rndkey1,$inout4
movaps @tweak[2],@tweak[3]
aesdec $rndkey1,$inout5
$movkey 48($key),$rndkey1
movdqa $twres,$twtmp
paddd $twres,$twres
aesdec $rndkey0,$inout0
pxor @tweak[5],@tweak[2]
psrad \$31,$twtmp
aesdec $rndkey0,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$twtmp
aesdec $rndkey0,$inout2
aesdec $rndkey0,$inout3
pxor $twtmp,@tweak[5]
aesdec $rndkey0,$inout4
movaps @tweak[3],@tweak[4]
aesdec $rndkey0,$inout5
movdqa $twres,$rndkey0
paddd $twres,$twres
aesdec $rndkey1,$inout0
pxor @tweak[5],@tweak[3]
psrad \$31,$rndkey0
aesdec $rndkey1,$inout1
paddq @tweak[5],@tweak[5]
pand $twmask,$rndkey0
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
pxor $rndkey0,@tweak[5]
$movkey ($key_),$rndkey0
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
$movkey 16($key_),$rndkey1
pxor @tweak[5],@tweak[4]
psrad \$31,$twres
aesdeclast `16*0`(%rsp),$inout0
paddq @tweak[5],@tweak[5]
pand $twmask,$twres
aesdeclast `16*1`(%rsp),$inout1
aesdeclast `16*2`(%rsp),$inout2
pxor $twres,@tweak[5]
aesdeclast `16*3`(%rsp),$inout3
aesdeclast `16*4`(%rsp),$inout4
aesdeclast `16*5`(%rsp),$inout5
mov $rnds_,$rounds # restore $rounds
lea `16*6`($out),$out
movups $inout0,`-16*6`($out) # write output
movups $inout1,`-16*5`($out)
movups $inout2,`-16*4`($out)
movups $inout3,`-16*3`($out)
movups $inout4,`-16*2`($out)
movups $inout5,`-16*1`($out)
sub \$16*6,$len
jnc .Lxts_dec_grandloop
lea 7($rounds,$rounds),$rounds # restore original value
mov $key_,$key # restore $key
mov $rounds,$rnds_ # backup $rounds
.Lxts_dec_short:
pxor $rndkey0,@tweak[0]
pxor $rndkey0,@tweak[1]
add \$16*6,$len
jz .Lxts_dec_done
pxor $rndkey0,@tweak[2]
cmp \$0x20,$len
jb .Lxts_dec_one
pxor $rndkey0,@tweak[3]
je .Lxts_dec_two
pxor $rndkey0,@tweak[4]
cmp \$0x40,$len
jb .Lxts_dec_three
je .Lxts_dec_four
movdqu ($inp),$inout0
movdqu 16*1($inp),$inout1
movdqu 16*2($inp),$inout2
pxor @tweak[0],$inout0
movdqu 16*3($inp),$inout3
pxor @tweak[1],$inout1
movdqu 16*4($inp),$inout4
lea 16*5($inp),$inp
pxor @tweak[2],$inout2
pxor @tweak[3],$inout3
pxor @tweak[4],$inout4
call _aesni_decrypt6
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
movdqu $inout0,($out)
xorps @tweak[3],$inout3
movdqu $inout1,16*1($out)
xorps @tweak[4],$inout4
movdqu $inout2,16*2($out)
pxor $twtmp,$twtmp
movdqu $inout3,16*3($out)
pcmpgtd @tweak[5],$twtmp
movdqu $inout4,16*4($out)
lea 16*5($out),$out
pshufd \$0x13,$twtmp,@tweak[1] # $twres
and \$15,$len_
jz .Lxts_dec_ret
movdqa @tweak[5],@tweak[0]
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
pand $twmask,@tweak[1] # isolate carry and residue
pxor @tweak[5],@tweak[1]
jmp .Lxts_dec_done2
.align 16
.Lxts_dec_one:
movups ($inp),$inout0
lea 16*1($inp),$inp
xorps @tweak[0],$inout0
___
# One leftover block: decrypt $inout0 (tweak applied before and after).
&aesni_generate1("dec",$key,$rounds);
# Remaining short paths (two/three/four blocks).  Each path leaves the next
# two unused tweaks in @tweak[0] and @tweak[1] for the tail handling.
# .Lxts_dec_done2 (reached only when the length has a remainder): the
# held-back full block is loaded and XORed with @tweak[1], the later of the
# two tweaks, before being decrypted.
$code.=<<___;
xorps @tweak[0],$inout0
movdqa @tweak[1],@tweak[0]
movups $inout0,($out)
movdqa @tweak[2],@tweak[1]
lea 16*1($out),$out
jmp .Lxts_dec_done
.align 16
.Lxts_dec_two:
movups ($inp),$inout0
movups 16($inp),$inout1
lea 32($inp),$inp
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
call _aesni_decrypt3
xorps @tweak[0],$inout0
movdqa @tweak[2],@tweak[0]
xorps @tweak[1],$inout1
movdqa @tweak[3],@tweak[1]
movups $inout0,($out)
movups $inout1,16*1($out)
lea 16*2($out),$out
jmp .Lxts_dec_done
.align 16
.Lxts_dec_three:
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
lea 16*3($inp),$inp
xorps @tweak[0],$inout0
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
call _aesni_decrypt3
xorps @tweak[0],$inout0
movdqa @tweak[3],@tweak[0]
xorps @tweak[1],$inout1
movdqa @tweak[4],@tweak[1]
xorps @tweak[2],$inout2
movups $inout0,($out)
movups $inout1,16*1($out)
movups $inout2,16*2($out)
lea 16*3($out),$out
jmp .Lxts_dec_done
.align 16
.Lxts_dec_four:
movups ($inp),$inout0
movups 16*1($inp),$inout1
movups 16*2($inp),$inout2
xorps @tweak[0],$inout0
movups 16*3($inp),$inout3
lea 16*4($inp),$inp
xorps @tweak[1],$inout1
xorps @tweak[2],$inout2
xorps @tweak[3],$inout3
call _aesni_decrypt4
pxor @tweak[0],$inout0
movdqa @tweak[4],@tweak[0]
pxor @tweak[1],$inout1
movdqa @tweak[5],@tweak[1]
pxor @tweak[2],$inout2
movdqu $inout0,($out)
pxor @tweak[3],$inout3
movdqu $inout1,16*1($out)
movdqu $inout2,16*2($out)
movdqu $inout3,16*3($out)
lea 16*4($out),$out
jmp .Lxts_dec_done
.align 16
.Lxts_dec_done:
and \$15,$len_
jz .Lxts_dec_ret
.Lxts_dec_done2:
mov $len_,$len
mov $key_,$key # restore $key
mov $rnds_,$rounds # restore $rounds
movups ($inp),$inout0
xorps @tweak[1],$inout0
___
# Decrypt the held-back full block (with @tweak[1]).
&aesni_generate1("dec",$key,$rounds);
# Store that block, then .Lxts_dec_steal swaps bytes one at a time: trailing
# input bytes go into the block just written, the bytes they displace go to
# the tail of the output.  %eax/%ecx are borrowed for the byte moves, so
# $key and $rounds are restored afterwards.
$code.=<<___;
xorps @tweak[1],$inout0
movups $inout0,($out)
.Lxts_dec_steal:
movzb 16($inp),%eax # borrow $rounds ...
movzb ($out),%ecx # ... and $key
lea 1($inp),$inp
mov %al,($out)
mov %cl,16($out)
lea 1($out),$out
sub \$1,$len
jnz .Lxts_dec_steal
sub $len_,$out # rewind $out
mov $key_,$key # restore $key
mov $rnds_,$rounds # restore $rounds
movups ($out),$inout0
xorps @tweak[0],$inout0
___
# Decrypt the patched block with the earlier tweak (@tweak[0]).
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps @tweak[0],$inout0
movups $inout0,($out)
.Lxts_dec_ret:
___
# Win64 only: reload %xmm6-%xmm15 (offsets relative to %rbp = entry %rsp-8).
$code.=<<___ if ($win64);
movaps -0xa0(%rbp),%xmm6
movaps -0x90(%rbp),%xmm7
movaps -0x80(%rbp),%xmm8
movaps -0x70(%rbp),%xmm9
movaps -0x60(%rbp),%xmm10
movaps -0x50(%rbp),%xmm11
movaps -0x40(%rbp),%xmm12
movaps -0x30(%rbp),%xmm13
movaps -0x20(%rbp),%xmm14
movaps -0x10(%rbp),%xmm15
___
# Epilogue: drop the aligned frame via %rbp and restore the caller's %rbp.
$code.=<<___;
lea (%rbp),%rsp
pop %rbp
.Lxts_dec_epilogue:
ret
.size aesni_xts_decrypt,.-aesni_xts_decrypt
___
} }}
########################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
{
# Emits ${PREFIX}_cbc_encrypt.  The generated routine returns at once for
# a zero length, then dispatches on the 6th argument (enc):
#   - encryption handles one 16-byte block per iteration; a short final
#     block is copied to the output, zero-padded there and run through
#     the same loop once more;
#   - decryption handles eight blocks per iteration of .Lcbc_dec_loop8 and
#     has dedicated tails for the remaining one to seven blocks.
#
# $frame_size: 0x10 bytes of scratch at (%rsp), used by
# .Lcbc_dec_tail_partial, plus on Win64 0xa0 bytes for %xmm6-%xmm15.
my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
# $iv carries the chaining value; $in0..$in4 keep copies of ciphertext
# blocks so they can be XORed into the following decrypted blocks.
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
# The 8-block decrypt loop repurposes the $key_ register as a second
# input pointer.
my $inp_=$key_;
$code.=<<___;
.globl ${PREFIX}_cbc_encrypt
.type ${PREFIX}_cbc_encrypt,\@function,6
.align 16
${PREFIX}_cbc_encrypt:
test $len,$len # check length
jz .Lcbc_ret
mov 240($key),$rnds_ # key->rounds
mov $key,$key_ # backup $key
test %r9d,%r9d # 6th argument
jz .Lcbc_decrypt
#--------------------------- CBC ENCRYPT ------------------------------#
movups ($ivp),$inout0 # load iv as initial state
mov $rnds_,$rounds
cmp \$16,$len
jb .Lcbc_enc_tail
sub \$16,$len
jmp .Lcbc_enc_loop
.align 16
.Lcbc_enc_loop:
movups ($inp),$inout1 # load input
lea 16($inp),$inp
#xorps $inout1,$inout0
___
# aesni_generate1 is defined earlier in the file.  It is passed the two
# extra registers here, presumably so that the XOR of the input block with
# the running state is folded into the first round (note the disabled
# xorps above) -- TODO confirm against its definition.
&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
$code.=<<___;
mov $rnds_,$rounds # restore $rounds
mov $key_,$key # restore $key
movups $inout0,0($out) # store output
lea 16($out),$out
sub \$16,$len
jnc .Lcbc_enc_loop
add \$16,$len
jnz .Lcbc_enc_tail
movups $inout0,($ivp)
jmp .Lcbc_ret
.Lcbc_enc_tail:
mov $len,%rcx # zaps $key
xchg $inp,$out # $inp is %rsi and $out is %rdi now
.long 0x9066A4F3 # rep movsb
mov \$16,%ecx # zero tail
sub $len,%rcx
xor %eax,%eax
.long 0x9066AAF3 # rep stosb
lea -16(%rdi),%rdi # rewind $out by 1 block
mov $rnds_,$rounds # restore $rounds
mov %rdi,%rsi # $inp and $out are the same
mov $key_,$key # restore $key
xor $len,$len # len=16
jmp .Lcbc_enc_loop # one more spin
#--------------------------- CBC DECRYPT ------------------------------#
.align 16
.Lcbc_decrypt:
lea (%rsp),%rax
push %rbp
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
# Win64 only: save %xmm6-%xmm15 above the 0x10-byte scratch area.  The
# .Lcbc_decrypt_body label exists only in Win64 builds; cbc_se_handler
# uses it to tell whether the prologue has completed.
$code.=<<___ if ($win64);
movaps %xmm6,0x10(%rsp)
movaps %xmm7,0x20(%rsp)
movaps %xmm8,0x30(%rsp)
movaps %xmm9,0x40(%rsp)
movaps %xmm10,0x50(%rsp)
movaps %xmm11,0x60(%rsp)
movaps %xmm12,0x70(%rsp)
movaps %xmm13,0x80(%rsp)
movaps %xmm14,0x90(%rsp)
movaps %xmm15,0xa0(%rsp)
.Lcbc_decrypt_body:
___
$code.=<<___;
lea -8(%rax),%rbp
movups ($ivp),$iv
mov $rnds_,$rounds
cmp \$0x50,$len
jbe .Lcbc_dec_tail
$movkey ($key),$rndkey0
movdqu 0x00($inp),$inout0 # load input
movdqu 0x10($inp),$inout1
movdqa $inout0,$in0
movdqu 0x20($inp),$inout2
movdqa $inout1,$in1
movdqu 0x30($inp),$inout3
movdqa $inout2,$in2
movdqu 0x40($inp),$inout4
movdqa $inout3,$in3
movdqu 0x50($inp),$inout5
movdqa $inout4,$in4
cmp \$0x70,$len
jbe .Lcbc_dec_six_or_seven
sub \$0x70,$len
lea 0x70($key),$key # size optimization
jmp .Lcbc_dec_loop8_enter
.align 16
.Lcbc_dec_loop8:
movups $inout7,($out)
lea 0x10($out),$out
.Lcbc_dec_loop8_enter:
movdqu 0x60($inp),$inout6
pxor $rndkey0,$inout0
movdqu 0x70($inp),$inout7
pxor $rndkey0,$inout1
$movkey 0x10-0x70($key),$rndkey1
pxor $rndkey0,$inout2
xor $inp_,$inp_
cmp \$0x70,$len # is there at least 0x60 bytes ahead?
pxor $rndkey0,$inout3
pxor $rndkey0,$inout4
pxor $rndkey0,$inout5
pxor $rndkey0,$inout6
aesdec $rndkey1,$inout0
pxor $rndkey0,$inout7
$movkey 0x20-0x70($key),$rndkey0
aesdec $rndkey1,$inout1
aesdec $rndkey1,$inout2
aesdec $rndkey1,$inout3
aesdec $rndkey1,$inout4
aesdec $rndkey1,$inout5
setnc ${inp_}b
aesdec $rndkey1,$inout6
shl \$7,$inp_
aesdec $rndkey1,$inout7
add $inp,$inp_
$movkey 0x30-0x70($key),$rndkey1
___
# Unrolled middle rounds for all eight blocks, alternating between
# $rndkey0 (odd $i) and $rndkey1 (even $i) while the other one is being
# reloaded.  After the $i==7 step the emitted code compares $rounds with
# 11 and leaves for .Lcbc_dec_done if below; after the $i==9 step it
# leaves if equal, reusing the flags of that same cmp (the instructions
# emitted in between are aesdec and $movkey loads).
for($i=1;$i<12;$i++) {
my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
$code.=<<___;
aesdec $rndkeyx,$inout0
aesdec $rndkeyx,$inout1
aesdec $rndkeyx,$inout2
aesdec $rndkeyx,$inout3
aesdec $rndkeyx,$inout4
aesdec $rndkeyx,$inout5
aesdec $rndkeyx,$inout6
aesdec $rndkeyx,$inout7
$movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
___
$code.=<<___ if ($i==7);
cmp \$11,$rounds
jb .Lcbc_dec_done
___
$code.=<<___ if ($i==9);
je .Lcbc_dec_done
___
}
# Last two rounds of the 8-block loop, the stores of seven results, the
# preload of the next iteration's input, and all the tail cases.
$code.=<<___;
.Lcbc_dec_done:
aesdec $rndkey1,$inout0
pxor $rndkey0,$iv
aesdec $rndkey1,$inout1
pxor $rndkey0,$in0
aesdec $rndkey1,$inout2
pxor $rndkey0,$in1
aesdec $rndkey1,$inout3
pxor $rndkey0,$in2
aesdec $rndkey1,$inout4
pxor $rndkey0,$in3
aesdec $rndkey1,$inout5
pxor $rndkey0,$in4
aesdec $rndkey1,$inout6
aesdec $rndkey1,$inout7
movdqu 0x50($inp),$rndkey1
aesdeclast $iv,$inout0
movdqu 0x60($inp),$iv # borrow $iv
pxor $rndkey0,$rndkey1
aesdeclast $in0,$inout1
pxor $rndkey0,$iv
movdqu 0x70($inp),$rndkey0 # next IV
lea 0x80($inp),$inp
aesdeclast $in1,$inout2
movdqu 0x00($inp_),$in0
aesdeclast $in2,$inout3
movdqu 0x10($inp_),$in1
aesdeclast $in3,$inout4
movdqu 0x20($inp_),$in2
aesdeclast $in4,$inout5
movdqu 0x30($inp_),$in3
aesdeclast $rndkey1,$inout6
movdqu 0x40($inp_),$in4
aesdeclast $iv,$inout7
movdqa $rndkey0,$iv # return $iv
movdqu 0x50($inp_),$rndkey1
$movkey -0x70($key),$rndkey0
movups $inout0,($out) # store output
movdqa $in0,$inout0
movups $inout1,0x10($out)
movdqa $in1,$inout1
movups $inout2,0x20($out)
movdqa $in2,$inout2
movups $inout3,0x30($out)
movdqa $in3,$inout3
movups $inout4,0x40($out)
movdqa $in4,$inout4
movups $inout5,0x50($out)
movdqa $rndkey1,$inout5
movups $inout6,0x60($out)
lea 0x70($out),$out
sub \$0x80,$len
ja .Lcbc_dec_loop8
movaps $inout7,$inout0
lea -0x70($key),$key
add \$0x70,$len
jle .Lcbc_dec_tail_collected
movups $inout7,($out)
lea 0x10($out),$out
cmp \$0x50,$len
jbe .Lcbc_dec_tail
movaps $in0,$inout0
.Lcbc_dec_six_or_seven:
cmp \$0x60,$len
ja .Lcbc_dec_seven
movaps $inout5,$inout6
call _aesni_decrypt6
pxor $iv,$inout0 # ^= IV
movaps $inout6,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
pxor $in3,$inout4
movdqu $inout3,0x30($out)
pxor $in4,$inout5
movdqu $inout4,0x40($out)
lea 0x50($out),$out
movdqa $inout5,$inout0
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_seven:
movups 0x60($inp),$inout6
xorps $inout7,$inout7
call _aesni_decrypt8
movups 0x50($inp),$inout7
pxor $iv,$inout0 # ^= IV
movups 0x60($inp),$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
pxor $in3,$inout4
movdqu $inout3,0x30($out)
pxor $in4,$inout5
movdqu $inout4,0x40($out)
pxor $inout7,$inout6
movdqu $inout5,0x50($out)
lea 0x60($out),$out
movdqa $inout6,$inout0
jmp .Lcbc_dec_tail_collected
.Lcbc_dec_tail:
movups ($inp),$inout0
sub \$0x10,$len
jbe .Lcbc_dec_one
movups 0x10($inp),$inout1
movaps $inout0,$in0
sub \$0x10,$len
jbe .Lcbc_dec_two
movups 0x20($inp),$inout2
movaps $inout1,$in1
sub \$0x10,$len
jbe .Lcbc_dec_three
movups 0x30($inp),$inout3
movaps $inout2,$in2
sub \$0x10,$len
jbe .Lcbc_dec_four
movups 0x40($inp),$inout4
movaps $inout3,$in3
movaps $inout4,$in4
xorps $inout5,$inout5
call _aesni_decrypt6
pxor $iv,$inout0
movaps $in4,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
pxor $in3,$inout4
movdqu $inout3,0x30($out)
lea 0x40($out),$out
movdqa $inout4,$inout0
sub \$0x10,$len
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_one:
movaps $inout0,$in0
___
# Single-block tail: decrypt $inout0 with the one-block generator.
&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
xorps $iv,$inout0
movaps $in0,$iv
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_two:
movaps $inout1,$in1
xorps $inout2,$inout2
call _aesni_decrypt3
pxor $iv,$inout0
movaps $in1,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
movdqa $inout1,$inout0
lea 0x10($out),$out
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_three:
movaps $inout2,$in2
call _aesni_decrypt3
pxor $iv,$inout0
movaps $in2,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
movdqa $inout2,$inout0
lea 0x20($out),$out
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_four:
movaps $inout3,$in3
call _aesni_decrypt4
pxor $iv,$inout0
movaps $in3,$iv
pxor $in0,$inout1
movdqu $inout0,($out)
pxor $in1,$inout2
movdqu $inout1,0x10($out)
pxor $in2,$inout3
movdqu $inout2,0x20($out)
movdqa $inout3,$inout0
lea 0x30($out),$out
jmp .Lcbc_dec_tail_collected
.align 16
.Lcbc_dec_tail_collected:
movups $iv,($ivp)
and \$15,$len
jnz .Lcbc_dec_tail_partial
movups $inout0,($out)
jmp .Lcbc_dec_ret
.align 16
.Lcbc_dec_tail_partial:
movaps $inout0,(%rsp)
mov \$16,%rcx
mov $out,%rdi
sub $len,%rcx
lea (%rsp),%rsi
.long 0x9066A4F3 # rep movsb
.Lcbc_dec_ret:
___
# Win64 only: reload %xmm6-%xmm15 from the slots filled in the prologue.
$code.=<<___ if ($win64);
movaps 0x10(%rsp),%xmm6
movaps 0x20(%rsp),%xmm7
movaps 0x30(%rsp),%xmm8
movaps 0x40(%rsp),%xmm9
movaps 0x50(%rsp),%xmm10
movaps 0x60(%rsp),%xmm11
movaps 0x70(%rsp),%xmm12
movaps 0x80(%rsp),%xmm13
movaps 0x90(%rsp),%xmm14
movaps 0xa0(%rsp),%xmm15
___
# Common epilogue: %rbp points at the saved %rbp pushed in the decrypt
# prologue; the encrypt path jumps straight to .Lcbc_ret.
$code.=<<___;
lea (%rbp),%rsp
pop %rbp
.Lcbc_ret:
ret
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
___
}
# int $PREFIX_set_[en|de]crypt_key (const unsigned char *userKey,
# int bits, AES_KEY *key)
{ my ($inp,$bits,$key) = @_4args;
# $bits is an int, so refer to the argument register by its 32-bit name.
$bits =~ s/%r/%e/;
# Emits ${PREFIX}_set_decrypt_key.  The generated code first calls
# __aesni_set_encrypt_key and returns its status unchanged if non-zero.
# Otherwise it reverses the order of the round keys in place: the first
# and last are simply swapped, every other pair is swapped with aesimc
# applied to both, and the middle key gets aesimc on its own.
$code.=<<___;
.globl ${PREFIX}_set_decrypt_key
.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
.align 16
${PREFIX}_set_decrypt_key:
.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
call __aesni_set_encrypt_key
shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
test %eax,%eax
jnz .Ldec_key_ret
lea 16($key,$bits),$inp # points at the end of key schedule
$movkey ($key),%xmm0 # just swap
$movkey ($inp),%xmm1
$movkey %xmm0,($inp)
$movkey %xmm1,($key)
lea 16($key),$key
lea -16($inp),$inp
.Ldec_key_inverse:
$movkey ($key),%xmm0 # swap and inverse
$movkey ($inp),%xmm1
aesimc %xmm0,%xmm0
aesimc %xmm1,%xmm1
lea 16($key),$key
lea -16($inp),$inp
$movkey %xmm0,16($inp)
$movkey %xmm1,-16($key)
cmp $key,$inp
ja .Ldec_key_inverse
$movkey ($key),%xmm0 # inverse middle
aesimc %xmm0,%xmm0
$movkey %xmm0,($inp)
.Ldec_key_ret:
add \$8,%rsp
ret
.LSEH_end_set_decrypt_key:
.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
___
# Emits ${PREFIX}_set_encrypt_key (also reachable as
# __aesni_set_encrypt_key) together with its .Lkey_expansion_* helpers.
# The generated code returns, in %rax, -1 if either pointer argument is
# NULL, -2 if bits is not 128, 192 or 256, and 0 on success.  On success
# it also stores 9, 11 or 13 respectively at offset 240 of the key
# structure.
#
# This is based on submission by
#
# Huang Ying <ying.huang@intel.com>
# Vinodh Gopal <vinodh.gopal@intel.com>
# Kahraman Akdemir
#
# Aggressively optimized with respect to aeskeygenassist's critical path
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
#
$code.=<<___;
.globl ${PREFIX}_set_encrypt_key
.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
.align 16
${PREFIX}_set_encrypt_key:
__aesni_set_encrypt_key:
.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
mov \$-1,%rax
test $inp,$inp
jz .Lenc_key_ret
test $key,$key
jz .Lenc_key_ret
movups ($inp),%xmm0 # pull first 128 bits of *userKey
xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
lea 16($key),%rax
cmp \$256,$bits
je .L14rounds
cmp \$192,$bits
je .L12rounds
cmp \$128,$bits
jne .Lbad_keybits
.L10rounds:
mov \$9,$bits # 10 rounds for 128-bit key
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
call .Lkey_expansion_128_cold
aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
call .Lkey_expansion_128
aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
call .Lkey_expansion_128
aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
call .Lkey_expansion_128
aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
call .Lkey_expansion_128
aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
call .Lkey_expansion_128
aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
call .Lkey_expansion_128
aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
call .Lkey_expansion_128
aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
call .Lkey_expansion_128
aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
call .Lkey_expansion_128
$movkey %xmm0,(%rax)
mov $bits,80(%rax) # 240(%rdx)
xor %eax,%eax
jmp .Lenc_key_ret
.align 16
.L12rounds:
movq 16($inp),%xmm2 # remaining 1/3 of *userKey
mov \$11,$bits # 12 rounds for 192
$movkey %xmm0,($key) # round 0
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
call .Lkey_expansion_192a_cold
aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
call .Lkey_expansion_192b
aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
call .Lkey_expansion_192a
aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
call .Lkey_expansion_192b
aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
call .Lkey_expansion_192a
aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
call .Lkey_expansion_192b
aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
call .Lkey_expansion_192a
aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
call .Lkey_expansion_192b
$movkey %xmm0,(%rax)
mov $bits,48(%rax) # 240(%rdx)
xor %rax, %rax
jmp .Lenc_key_ret
.align 16
.L14rounds:
movups 16($inp),%xmm2 # remaning half of *userKey
mov \$13,$bits # 14 rounds for 256
lea 16(%rax),%rax
$movkey %xmm0,($key) # round 0
$movkey %xmm2,16($key) # round 1
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
call .Lkey_expansion_256a_cold
aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
call .Lkey_expansion_256b
aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
call .Lkey_expansion_256a
aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
call .Lkey_expansion_256b
aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
call .Lkey_expansion_256a
aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
call .Lkey_expansion_256b
aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
call .Lkey_expansion_256a
aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
call .Lkey_expansion_256b
aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
call .Lkey_expansion_256a
aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
call .Lkey_expansion_256b
aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
call .Lkey_expansion_256a
aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
call .Lkey_expansion_256b
aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
call .Lkey_expansion_256a
$movkey %xmm0,(%rax)
mov $bits,16(%rax) # 240(%rdx)
xor %rax,%rax
jmp .Lenc_key_ret
.align 16
.Lbad_keybits:
mov \$-2,%rax
.Lenc_key_ret:
add \$8,%rsp
ret
.LSEH_end_set_encrypt_key:
.align 16
.Lkey_expansion_128:
$movkey %xmm0,(%rax)
lea 16(%rax),%rax
.Lkey_expansion_128_cold:
shufps \$0b00010000,%xmm0,%xmm4
xorps %xmm4, %xmm0
shufps \$0b10001100,%xmm0,%xmm4
xorps %xmm4, %xmm0
shufps \$0b11111111,%xmm1,%xmm1 # critical path
xorps %xmm1,%xmm0
ret
.align 16
.Lkey_expansion_192a:
$movkey %xmm0,(%rax)
lea 16(%rax),%rax
.Lkey_expansion_192a_cold:
movaps %xmm2, %xmm5
.Lkey_expansion_192b_warm:
shufps \$0b00010000,%xmm0,%xmm4
movdqa %xmm2,%xmm3
xorps %xmm4,%xmm0
shufps \$0b10001100,%xmm0,%xmm4
pslldq \$4,%xmm3
xorps %xmm4,%xmm0
pshufd \$0b01010101,%xmm1,%xmm1 # critical path
pxor %xmm3,%xmm2
pxor %xmm1,%xmm0
pshufd \$0b11111111,%xmm0,%xmm3
pxor %xmm3,%xmm2
ret
.align 16
.Lkey_expansion_192b:
movaps %xmm0,%xmm3
shufps \$0b01000100,%xmm0,%xmm5
$movkey %xmm5,(%rax)
shufps \$0b01001110,%xmm2,%xmm3
$movkey %xmm3,16(%rax)
lea 32(%rax),%rax
jmp .Lkey_expansion_192b_warm
.align 16
.Lkey_expansion_256a:
$movkey %xmm2,(%rax)
lea 16(%rax),%rax
.Lkey_expansion_256a_cold:
shufps \$0b00010000,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps \$0b10001100,%xmm0,%xmm4
xorps %xmm4,%xmm0
shufps \$0b11111111,%xmm1,%xmm1 # critical path
xorps %xmm1,%xmm0
ret
.align 16
.Lkey_expansion_256b:
$movkey %xmm0,(%rax)
lea 16(%rax),%rax
shufps \$0b00010000,%xmm2,%xmm4
xorps %xmm4,%xmm2
shufps \$0b10001100,%xmm2,%xmm4
xorps %xmm4,%xmm2
shufps \$0b10101010,%xmm1,%xmm1 # critical path
xorps %xmm1,%xmm2
ret
.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
___
}
# Constant pool appended after the routines, 64-byte aligned at both
# ends: the labelled tables .Lbswap_mask, .Lincrement32, .Lincrement64,
# .Lxts_magic and .Lincrement1 (presumably referenced by the mode
# routines earlier in the file -- their uses are outside this section),
# followed by a NUL-terminated identification string.
$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lincrement32:
.long 6,6,6,0
.Lincrement64:
.long 1,0,0,0
.Lxts_magic:
.long 0x87,0,1,0
.Lincrement1:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
# Win64 structured-exception-handling support: the language-specific
# handlers plus the .pdata/.xdata tables that bind them to the routines.
# Each handler rebuilds the caller's Rsp (and, where the routine saved
# them, Rbp and Xmm6-15) in the CONTEXT record, copies the record to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
#
# Registers carrying the four handler arguments.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
___
# Handlers for the ECB, CCM64 and CTR32/XTS routines; these are emitted
# only when $PREFIX is "aesni".  ccm64_se_handler and ctr_xts_se_handler
# read the body/epilogue label pair from HandlerData[] to decide whether
# the frame was fully set up at the faulting Rip.
$code.=<<___ if ($PREFIX eq "aesni");
.type ecb_se_handler,\@abi-omnipotent
.align 16
ecb_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
jmp .Lcommon_seh_tail
.size ecb_se_handler,.-ecb_se_handler
.type ccm64_se_handler,\@abi-omnipotent
.align 16
ccm64_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
lea 0(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea 0x58(%rax),%rax # adjust stack pointer
jmp .Lcommon_seh_tail
.size ccm64_se_handler,.-ccm64_se_handler
.type ctr_xts_se_handler,\@abi-omnipotent
.align 16
ctr_xts_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue lable
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
mov 160($context),%rax # pull context->Rbp
lea -0xa0(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
jmp .Lcommon_rbp_tail
.size ctr_xts_se_handler,.-ctr_xts_se_handler
___
# The CBC handler is emitted for every $PREFIX.  It also hosts the shared
# tails (.Lcommon_rbp_tail, .Lcommon_seh_tail) that the handlers above
# jump into.  It compares Rip against the .Lcbc_decrypt,
# .Lcbc_decrypt_body and .Lcbc_ret labels directly instead of using
# HandlerData[].
$code.=<<___;
.type cbc_se_handler,\@abi-omnipotent
.align 16
cbc_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 152($context),%rax # pull context->Rsp
mov 248($context),%rbx # pull context->Rip
lea .Lcbc_decrypt(%rip),%r10
cmp %r10,%rbx # context->Rip<"prologue" label
jb .Lcommon_seh_tail
lea .Lcbc_decrypt_body(%rip),%r10
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
jb .Lrestore_cbc_rax
lea .Lcbc_ret(%rip),%r10
cmp %r10,%rbx # context->Rip>="epilogue" label
jae .Lcommon_seh_tail
lea 16(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
.Lcommon_rbp_tail:
mov 160($context),%rax # pull context->Rbp
mov (%rax),%rbp # restore saved %rbp
lea 8(%rax),%rax # adjust stack pointer
mov %rbp,160($context) # restore context->Rbp
jmp .Lcommon_seh_tail
.Lrestore_cbc_rax:
mov 120($context),%rax
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size cbc_se_handler,.-cbc_se_handler
.section .pdata
.align 4
___
# .pdata entries (begin, end, unwind-info triples) for the routines that
# exist only in the "aesni" flavour.
$code.=<<___ if ($PREFIX eq "aesni");
.rva .LSEH_begin_aesni_ecb_encrypt
.rva .LSEH_end_aesni_ecb_encrypt
.rva .LSEH_info_ecb
.rva .LSEH_begin_aesni_ccm64_encrypt_blocks
.rva .LSEH_end_aesni_ccm64_encrypt_blocks
.rva .LSEH_info_ccm64_enc
.rva .LSEH_begin_aesni_ccm64_decrypt_blocks
.rva .LSEH_end_aesni_ccm64_decrypt_blocks
.rva .LSEH_info_ccm64_dec
.rva .LSEH_begin_aesni_ctr32_encrypt_blocks
.rva .LSEH_end_aesni_ctr32_encrypt_blocks
.rva .LSEH_info_ctr32
.rva .LSEH_begin_aesni_xts_encrypt
.rva .LSEH_end_aesni_xts_encrypt
.rva .LSEH_info_xts_enc
.rva .LSEH_begin_aesni_xts_decrypt
.rva .LSEH_end_aesni_xts_decrypt
.rva .LSEH_info_xts_dec
___
# .pdata entries emitted for every $PREFIX (CBC and the two key-setup
# routines), followed by the start of .xdata.
$code.=<<___;
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
.rva .LSEH_end_${PREFIX}_cbc_encrypt
.rva .LSEH_info_cbc
.rva ${PREFIX}_set_decrypt_key
.rva .LSEH_end_set_decrypt_key
.rva .LSEH_info_key
.rva ${PREFIX}_set_encrypt_key
.rva .LSEH_end_set_encrypt_key
.rva .LSEH_info_key
.section .xdata
.align 8
___
# .xdata records for the "aesni"-only routines: each names its handler
# and, where the handler needs them, the body/epilogue labels it reads
# as HandlerData[].
$code.=<<___ if ($PREFIX eq "aesni");
.LSEH_info_ecb:
.byte 9,0,0,0
.rva ecb_se_handler
.LSEH_info_ccm64_enc:
.byte 9,0,0,0
.rva ccm64_se_handler
.rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
.LSEH_info_ccm64_dec:
.byte 9,0,0,0
.rva ccm64_se_handler
.rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
.LSEH_info_ctr32:
.byte 9,0,0,0
.rva ctr_xts_se_handler
.rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
.LSEH_info_xts_enc:
.byte 9,0,0,0
.rva ctr_xts_se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.LSEH_info_xts_dec:
.byte 9,0,0,0
.rva ctr_xts_se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
# .xdata records emitted for every $PREFIX.  .LSEH_info_key names no
# handler; it carries only the unwind code for the "sub rsp,8" that both
# key-setup routines start with.
$code.=<<___;
.LSEH_info_cbc:
.byte 9,0,0,0
.rva cbc_se_handler
.LSEH_info_key:
.byte 0x01,0x04,0x01,0x00
.byte 0x04,0x02,0x00,0x00 # sub rsp,8
___
}
# Append a REX prefix byte to an opcode list when either register number
# needs one.
#
#   $bytes - reference to the array of opcode bytes collected so far
#   $reg   - register number encoded in ModR/M.reg (>=8 sets REX.R, 0x04)
#   $rm    - register number encoded in ModR/M.rm  (>=8 sets REX.B, 0x01)
#
# Nothing is appended when both numbers are below 8.
sub rex {
    my ($bytes, $reg, $rm) = @_;

    my $ext_bits = ($reg >= 8 ? 0x04 : 0) | ($rm >= 8 ? 0x01 : 0);

    push @{$bytes}, 0x40 | $ext_bits if $ext_bits;
}
# Translate one AES-NI instruction into an equivalent ".byte" directive,
# so that the assembler never has to recognise the mnemonic itself.
#
# Recognised forms ($line is the instruction text up to and including
# its last %xmm operand):
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast}     disp(%rsp),%xmmD
#
# Returns the ".byte\t..." replacement for a recognised instruction.
# Any other text, including an "aes*" mnemonic that is not in the
# relevant opcode table, is returned unchanged.  Earlier this case
# returned undef, which made the s///e caller replace the instruction
# with an empty string and silently drop it from the output; passing it
# through leaves the decision to the assembler.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);			# mandatory operand-size prefix

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@opcode,$4,$3);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($3&7)|(($4&7)<<3);	# ModR/M
	my $c=$2;
	# a leading 0 means hex/octal notation, let oct() sort it out
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# unknown mnemonic: hand the line to the assembler untouched
	return $line if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	# unknown mnemonic: hand the line to the assembler untouched
	return $line if (!defined($opcodelet{$1}));
	my $off = $2;
	push @opcode,0x44 if ($3>=8);		# REX.R for %xmm8-15
	push @opcode,0x0f,0x38,$opcodelet{$1};
	push @opcode,0x44|(($3&7)<<3),0x24;	# ModR/M + SIB: disp8(%rsp)
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}
# Post-process the accumulated assembly text and write it out.

# Replace every `...` span with the value of the Perl expression inside.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand each AES-NI instruction (mnemonic through its last %xmm operand;
# anything after it on the line is discarded) to aesni() for encoding.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Output is buffered, so a write error may only surface when the handle
# is closed.  Fail loudly rather than exit successfully after producing
# truncated output.
close STDOUT or die "error closing STDOUT: $!";