#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
#	16-byte     64-byte     256-byte    1-KB        8-KB
#	53-67%      67-84%      91-94%      95-98%      97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.

# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.

# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.

######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
#		CBC en-/decrypt	CTR	XTS	ECB
# Westmere	3.77/1.37	1.37	1.52	1.27
# * Bridge	5.07/0.98	0.99	1.09	0.91
# Haswell	4.44/0.80	0.97	1.03	0.72
# Silvermont	5.77/3.56	3.67	4.03	3.46
# Bulldozer	5.80/0.98	1.05	1.24	0.93

$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-586.pl:-)
$inline=1;		# inline _aesni_[en|de]crypt

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

if ($PREFIX eq "aesni")	{ $movekey=\&movups; }
else			{ $movekey=\&movups; }

$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";	# backup copy for $rounds
$key_="ebp";	# backup copy for $key

$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5";	$in1="xmm5";
$inout4="xmm6";	$in0="xmm6";
$inout5="xmm7";	$ivec="xmm7";

# AESNI extension
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);	}
}
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
}
sub aesimc	{ aescommon(0xdb,@_); }
sub aesenc	{ aescommon(0xdc,@_); }
sub aesenclast	{ aescommon(0xdd,@_); }
sub aesdec	{ aescommon(0xde,@_); }
sub aesdeclast	{ aescommon(0xdf,@_); }

# Inline version of internal aesni_[en|de]crypt1
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
  $sn++;

    &$movekey		($rndkey0,&QWP(0,$key));
    &$movekey		($rndkey1,&QWP(16,$key));
    &xorps		($ivec,$rndkey0)	if (defined($ivec));
    &lea		($key,&DWP(32,$key));
    &xorps		($inout,$ivec)		if (defined($ivec));
    &xorps		($inout,$rndkey0)	if (!defined($ivec));
    &set_label("${p}1_loop_$sn");
	eval"&aes${p}	($inout,$rndkey1)";
	&dec		($rounds);
	&$movekey	($rndkey1,&QWP(0,$key));
	&lea		($key,&DWP(16,$key));
    &jnz		(&label("${p}1_loop_$sn"));
    eval"&aes${p}last	($inout,$rndkey1)";
}}

sub aesni_generate1	# fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));

    &function_begin_B("_aesni_${p}rypt1");
	&movups		($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(0x10,$key));
	&xorps		($inout,$rndkey0);
	&$movekey	($rndkey0,&QWP(0x20,$key));
	&lea		($key,&DWP(0x30,$key));
	&cmp		($rounds,11);
	&jb		(&label("${p}128"));
	&lea		($key,&DWP(0x20,$key));
	&je		(&label("${p}192"));
	&lea		($key,&DWP(0x20,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x40,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x30,$key));
    &set_label("${p}192");
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(-0x20,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-0x10,$key));
    &set_label("${p}128");
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x10,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x20,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x30,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x40,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x50,$key));
	eval"&aes${p}	($inout,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0x60,$key));
	eval"&aes${p}	($inout,$rndkey0)";
	&$movekey	($rndkey0,&QWP(0x70,$key));
	eval"&aes${p}	($inout,$rndkey1)";
    eval"&aes${p}last	($inout,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}

# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_encrypt");

# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
	&mov	("eax",&wparam(0));
	&mov	($key,&wparam(2));
	&movups	($inout0,&QWP(0,"eax"));
	&mov	($rounds,&DWP(240,$key));
	&mov	("eax",&wparam(1));
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&pxor	($rndkey0,$rndkey0);		# clear register bank
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,"eax"),$inout0);
	&pxor	($inout0,$inout0);
	&ret	();
&function_end_B("${PREFIX}_decrypt");

# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addreassable in 32-bit mode and therefore maximum
# of 6x is used instead...

sub aesni_generate2
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt2");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}2_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}2_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}

sub aesni_generate3
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt3");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&add		($rounds,16);

    &set_label("${p}3_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("${p}3_loop"));
    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}

# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement  would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
	&$movekey	($rndkey0,&QWP(0,$key));
	&$movekey	($rndkey1,&QWP(16,$key));
	&shl		($rounds,4);
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&$movekey	($rndkey0,&QWP(32,$key));
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	&data_byte	(0x0f,0x1f,0x40,0x00);
	&add		($rounds,16);

    &set_label("${p}4_loop");
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
	eval"&aes${p}	($inout3,$rndkey1)";
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}4_loop"));

    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}

sub aesni_generate6
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey0,&QWP(0,$key));
	&shl		($rounds,4);
	&$movekey	($rndkey1,&QWP(16,$key));
	&xorps		($inout0,$rndkey0);
	&pxor		($inout1,$rndkey0);	# pxor does better here
	&pxor		($inout2,$rndkey0);
	eval"&aes${p}	($inout0,$rndkey1)";
	&pxor		($inout3,$rndkey0);
	&pxor		($inout4,$rndkey0);
	eval"&aes${p}	($inout1,$rndkey1)";
	&lea		($key,&DWP(32,$key,$rounds));
	&neg		($rounds);
	eval"&aes${p}	($inout2,$rndkey1)";
	&pxor		($inout5,$rndkey0);
	&$movekey	($rndkey0,&QWP(0,$key,$rounds));
	&add		($rounds,16);
	&jmp		(&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
	eval"&aes${p}	($inout0,$rndkey1)";
	eval"&aes${p}	($inout1,$rndkey1)";
	eval"&aes${p}	($inout2,$rndkey1)";
    &set_label("_aesni_${p}rypt6_inner");
	eval"&aes${p}	($inout3,$rndkey1)";
	eval"&aes${p}	($inout4,$rndkey1)";
	eval"&aes${p}	($inout5,$rndkey1)";
    &set_label("_aesni_${p}rypt6_enter");
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	eval"&aes${p}	($inout0,$rndkey0)";
	eval"&aes${p}	($inout1,$rndkey0)";
	eval"&aes${p}	($inout2,$rndkey0)";
	eval"&aes${p}	($inout3,$rndkey0)";
	eval"&aes${p}	($inout4,$rndkey0)";
	eval"&aes${p}	($inout5,$rndkey0)";
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
    &jnz		(&label("${p}6_loop"));

    eval"&aes${p}	($inout0,$rndkey1)";
    eval"&aes${p}	($inout1,$rndkey1)";
    eval"&aes${p}	($inout2,$rndkey1)";
    eval"&aes${p}	($inout3,$rndkey1)";
    eval"&aes${p}	($inout4,$rndkey1)";
    eval"&aes${p}	($inout5,$rndkey1)";
    eval"&aes${p}last	($inout0,$rndkey0)";
    eval"&aes${p}last	($inout1,$rndkey0)";
    eval"&aes${p}last	($inout2,$rndkey0)";
    eval"&aes${p}last	($inout3,$rndkey0)";
    eval"&aes${p}last	($inout4,$rndkey0)";
    eval"&aes${p}last	($inout5,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");

if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
&function_begin("aesni_ecb_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&and	($len,-16);
	&jz	(&label("ecb_ret"));
	&mov	($rounds,&DWP(240,$key));
	&test	($rounds_,$rounds_);
	&jz	(&label("ecb_decrypt"));

	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_enc_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_enc_loop6_enter"));

&set_label("ecb_enc_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");

	&call	("_aesni_encrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_enc_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_enc_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_enc_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_enc_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_enc_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_enc_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_encrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	jmp	(&label("ecb_ret"));

&set_label("ecb_enc_one",16);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_two",16);
	&call	("_aesni_encrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_three",16);
	&call	("_aesni_encrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_enc_four",16);
	&call	("_aesni_encrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&jmp	(&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
	&mov	($key_,$key);		# backup $key
	&mov	($rounds_,$rounds);	# backup $rounds
	&cmp	($len,0x60);
	&jb	(&label("ecb_dec_tail"));

	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&sub	($len,0x60);
	&jmp	(&label("ecb_dec_loop6_enter"));

&set_label("ecb_dec_loop6",16);
	&movups	(&QWP(0,$out),$inout0);
	&movdqu	($inout0,&QWP(0,$inp));
	&movups	(&QWP(0x10,$out),$inout1);
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movups	(&QWP(0x30,$out),$inout3);
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movups	(&QWP(0x40,$out),$inout4);
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&movdqu	($inout5,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");

	&call	("_aesni_decrypt6");

	&mov	($key,$key_);		# restore $key
	&mov	($rounds,$rounds_);	# restore $rounds
	&sub	($len,0x60);
	&jnc	(&label("ecb_dec_loop6"));

	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));
	&add	($len,0x60);
	&jz	(&label("ecb_ret"));

&set_label("ecb_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&cmp	($len,0x20);
	&jb	(&label("ecb_dec_one"));
	&movups	($inout1,&QWP(0x10,$inp));
	&je	(&label("ecb_dec_two"));
	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x40);
	&jb	(&label("ecb_dec_three"));
	&movups	($inout3,&QWP(0x30,$inp));
	&je	(&label("ecb_dec_four"));
	&movups	($inout4,&QWP(0x40,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&movups	(&QWP(0,$out),$inout0);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_two",16);
	&call	("_aesni_decrypt2");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_three",16);
	&call	("_aesni_decrypt3");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ecb_ret"));

&set_label("ecb_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ecb_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");

######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));
	&movdqa	($inout3,&QWP(0,"esp"));
	&movdqa	($inout0,$ivec);
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&movups		($in0,&QWP(0,$inp));

	&xorps		($inout0,$rndkey0);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);		# cmac^=inp
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));
	&dec		($len);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# inp^=E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&pshufb	($inout0,$inout3);
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");

&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($rounds,&wparam(5));
	&mov	($key_,"esp");
	&sub	("esp",60);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# load ivec
	&movdqu	($cmac,&QWP(0,$rounds));	# load cmac
	&mov	($rounds,&DWP(240,$key));

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	&mov	(&DWP(20,"esp"),$key_);
	&mov	(&DWP(24,"esp"),$key_);
	&mov	(&DWP(28,"esp"),$key_);

	&movdqa	($inout3,&QWP(0,"esp"));	# bswap mask
	&movdqa	($inout0,$ivec);

	&mov	($key_,$key);
	&mov	($rounds_,$rounds);

	&pshufb	($ivec,$inout3);
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# load inp
	&paddq	($ivec,&QWP(16,"esp"));
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# inp ^= E(ivec)
	&movdqa	($inout0,$ivec);
	&movups	(&QWP(0,$out),$in0);		# save output
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);
	&xorps		($cmac,$in0);		# cmac^=out
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# load inp
	&paddq		($ivec,&QWP(16,"esp"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline)
	{   &aesni_inline_generate1("enc",$cmac,$in0);	}
	else
	{   &call	("_aesni_encrypt1",$cmac);	}

	&mov	("esp",&DWP(48,"esp"));
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);

	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&pxor	("xmm6","xmm6");
	&pxor	("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}

######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                         size_t blocks, const AES_KEY *key,
#                         const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
#	0	pshufb mask
#	16	vector addend: 0,6,6,6
# 	32	counter-less ivec
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp

&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));
	&mov	($key_,"esp");
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);

	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
	&mov	(&DWP(4,"esp"),0x08090a0b);
	&mov	(&DWP(8,"esp"),0x04050607);
	&mov	(&DWP(12,"esp"),0x00010203);

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds);
	&mov	(&DWP(20,"esp"),$rounds);
	&mov	(&DWP(24,"esp"),$rounds);
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters
	&bswap	($rounds_);
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor		($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&aesenc		($inout0,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov		($rounds,$rounds_);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout5,$rndkey1);

	&call		(&label("_aesni_encrypt6_enter"));

	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));

	&add	($len,6);
	&jz	(&label("ctr32_ret"));
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

&set_label("ctr32_tail");
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));

	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));
	
&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(48,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(64,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&mov	("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");

######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2
#	const unsigned char iv[16]);
#
{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);

&function_begin("aesni_xts_encrypt");
	&mov	($key,&wparam(4));		# key2
	&mov	($inp,&wparam(5));		# clear-text tweak

	&mov	($rounds,&DWP(240,$key));	# key2->rounds
	&movups	($inout0,&QWP(0,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));		# key1

	&mov	($key_,"esp");
	&sub	("esp",16*7+8);
	&mov	($rounds,&DWP(240,$key));	# key1->rounds
	&and	("esp",-16);			# align stack

	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
	&mov	(&DWP(16*6+4,"esp"),0);
	&mov	(&DWP(16*6+8,"esp"),1);
	&mov	(&DWP(16*6+12,"esp"),0);
	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp

	&movdqa	($tweak,$inout0);
	&pxor	($twtmp,$twtmp);
	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits

	&and	($len,-16);
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds
	&sub	($len,16*6);
	&jc	(&label("xts_enc_short"));

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&sub	($rounds_,$rounds);
	&lea	($key,&DWP(32,$key,$rounds));
	&jmp	(&label("xts_enc_loop6"));

&set_label("xts_enc_loop6",16);
	for ($i=0;$i<4;$i++) {
	    &pshufd	($twres,$twtmp,0x13);
	    &pxor	($twtmp,$twtmp);
	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
	    &pand	($twres,$twmask);	# isolate carry and residue
	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
	    &pxor	($tweak,$twres);
	}
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	 &$movekey	($rndkey0,&QWP(0,$key_));
	&pand	($inout5,$twmask);		# isolate carry and residue
	 &movups	($inout0,&QWP(0,$inp));	# load input
	&pxor	($inout5,$tweak);

	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	&mov	($rounds,$rounds_);		# restore $rounds
	&movdqu	($inout1,&QWP(16*1,$inp));
	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
	&movdqu	($inout2,&QWP(16*2,$inp));
	 &pxor		($inout1,$rndkey0);
	&movdqu	($inout3,&QWP(16*3,$inp));
	 &pxor		($inout2,$rndkey0);
	&movdqu	($inout4,&QWP(16*4,$inp));
	 &pxor		($inout3,$rndkey0);
	&movdqu	($rndkey1,&QWP(16*5,$inp));
	 &pxor		($inout4,$rndkey0);
	&lea	($inp,&DWP(16*6,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
	&pxor	($inout5,$rndkey1);

	 &$movekey	($rndkey1,&QWP(16,$key_));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	 &aesenc	($inout0,$rndkey1);
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	 &aesenc	($inout1,$rndkey1);
	&pxor		($inout5,$rndkey0);
	 &$movekey	($rndkey0,&QWP(32,$key_));
	 &aesenc	($inout2,$rndkey1);
	 &aesenc	($inout3,$rndkey1);
	 &aesenc	($inout4,$rndkey1);
	 &aesenc	($inout5,$rndkey1);
	&call		(&label("_aesni_encrypt6_enter"));

	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
       &pxor	($twtmp,$twtmp);
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
	&xorps	($inout1,&QWP(16*1,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*2,$out),$inout2);
	&xorps	($inout4,&QWP(16*4,"esp"));
	&movups	(&QWP(16*3,$out),$inout3);
	&xorps	($inout5,$tweak);
	&movups	(&QWP(16*4,$out),$inout4);
       &pshufd	($twres,$twtmp,0x13);
	&movups	(&QWP(16*5,$out),$inout5);
	&lea	($out,&DWP(16*6,$out));
       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87

	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);

	&sub	($len,16*6);
	&jnc	(&label("xts_enc_loop6"));

	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
	&mov	($key,$key_);			# restore $key
	&mov	($rounds_,$rounds);

&set_label("xts_enc_short");
	&add	($len,16*6);
	&jz	(&label("xts_enc_done6x"));

	&movdqa	($inout3,$tweak);		# put aside previous tweak
	&cmp	($len,0x20);
	&jb	(&label("xts_enc_one"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&je	(&label("xts_enc_two"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout4,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&cmp	($len,0x40);
	&jb	(&label("xts_enc_three"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout5,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&movdqa	(&QWP(16*0,"esp"),$inout3);
	&movdqa	(&QWP(16*1,"esp"),$inout4);
	&je	(&label("xts_enc_four"));

	&movdqa	(&QWP(16*2,"esp"),$inout5);
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*3,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
	&pand	($inout5,$twmask);		# isolate carry and residue
	&pxor	($inout5,$tweak);

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqu	($inout3,&QWP(16*3,$inp));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&lea	($inp,&DWP(16*5,$inp));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
	&pxor	($inout4,$inout5);

	&call	("_aesni_encrypt6");

	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout4,$tweak);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&movups	(&QWP(16*4,$out),$inout4);
	&lea	($out,&DWP(16*5,$out));
	&jmp	(&label("xts_enc_done"));

&set_label("xts_enc_one",16);
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&lea	($inp,&DWP(16*1,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&lea	($out,&DWP(16*1,$out));

	&movdqa	($tweak,$inout3);		# last tweak
	&jmp	(&label("xts_enc_done"));

&set_label("xts_enc_two",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&lea	($inp,&DWP(16*2,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);

	&call	("_aesni_encrypt2");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&lea	($out,&DWP(16*2,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_enc_done"));

&set_label("xts_enc_three",16);
	&movaps	($inout5,$tweak);		# put aside last tweak
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&lea	($inp,&DWP(16*3,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);

	&call	("_aesni_encrypt3");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&lea	($out,&DWP(16*3,$out));

	&movdqa	($tweak,$inout5);		# last tweak
	&jmp	(&label("xts_enc_done"));

&set_label("xts_enc_four",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movups	($inout3,&QWP(16*3,$inp));
	&lea	($inp,&DWP(16*4,$inp));
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&xorps	($inout3,$inout4);

	&call	("_aesni_encrypt4");

	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,$inout4);
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&lea	($out,&DWP(16*4,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_enc_done"));

&set_label("xts_enc_done6x",16);		# $tweak is pre-calculated
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&and	($len,15);
	&jz	(&label("xts_enc_ret"));
	&movdqa	($inout3,$tweak);
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&jmp	(&label("xts_enc_steal"));

&set_label("xts_enc_done",16);
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&pxor	($twtmp,$twtmp);
	&and	($len,15);
	&jz	(&label("xts_enc_ret"));

	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&pshufd	($inout3,$twtmp,0x13);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($inout3,&QWP(16*6,"esp"));	# isolate carry and residue
	&pxor	($inout3,$tweak);

&set_label("xts_enc_steal");
	&movz	($rounds,&BP(0,$inp));
	&movz	($key,&BP(-16,$out));
	&lea	($inp,&DWP(1,$inp));
	&mov	(&BP(-16,$out),&LB($rounds));
	&mov	(&BP(0,$out),&LB($key));
	&lea	($out,&DWP(1,$out));
	&sub	($len,1);
	&jnz	(&label("xts_enc_steal"));

	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	&movups	($inout0,&QWP(-16,$out));	# load input
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(-16,$out),$inout0);	# write output

&set_label("xts_enc_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
	&pxor	("xmm3","xmm3");
	&movdqa	(&QWP(16*1,"esp"),"xmm0");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(16*2,"esp"),"xmm0");
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(16*3,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(16*4,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&movdqa	(&QWP(16*5,"esp"),"xmm0");
	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
&function_end("aesni_xts_encrypt");

&function_begin("aesni_xts_decrypt");
	&mov	($key,&wparam(4));		# key2
	&mov	($inp,&wparam(5));		# clear-text tweak

	&mov	($rounds,&DWP(240,$key));	# key2->rounds
	&movups	($inout0,&QWP(0,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}

	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));		# key1

	&mov	($key_,"esp");
	&sub	("esp",16*7+8);
	&and	("esp",-16);			# align stack

	&xor	($rounds_,$rounds_);		# if(len%16) len-=16;
	&test	($len,15);
	&setnz	(&LB($rounds_));
	&shl	($rounds_,4);
	&sub	($len,$rounds_);

	&mov	(&DWP(16*6+0,"esp"),0x87);	# compose the magic constant
	&mov	(&DWP(16*6+4,"esp"),0);
	&mov	(&DWP(16*6+8,"esp"),1);
	&mov	(&DWP(16*6+12,"esp"),0);
	&mov	(&DWP(16*7+0,"esp"),$len);	# save original $len
	&mov	(&DWP(16*7+4,"esp"),$key_);	# save original %esp

	&mov	($rounds,&DWP(240,$key));	# key1->rounds
	&mov	($key_,$key);			# backup $key
	&mov	($rounds_,$rounds);		# backup $rounds

	&movdqa	($tweak,$inout0);
	&pxor	($twtmp,$twtmp);
	&movdqa	($twmask,&QWP(6*16,"esp"));	# 0x0...010...87
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits

	&and	($len,-16);
	&sub	($len,16*6);
	&jc	(&label("xts_dec_short"));

	&shl	($rounds,4);
	&mov	($rounds_,16);
	&sub	($rounds_,$rounds);
	&lea	($key,&DWP(32,$key,$rounds));
	&jmp	(&label("xts_dec_loop6"));

&set_label("xts_dec_loop6",16);
	for ($i=0;$i<4;$i++) {
	    &pshufd	($twres,$twtmp,0x13);
	    &pxor	($twtmp,$twtmp);
	    &movdqa	(&QWP(16*$i,"esp"),$tweak);
	    &paddq	($tweak,$tweak);	# &psllq($tweak,1);
	    &pand	($twres,$twmask);	# isolate carry and residue
	    &pcmpgtd	($twtmp,$tweak);	# broadcast upper bits
	    &pxor	($tweak,$twres);
	}
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*$i++,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	 &$movekey	($rndkey0,&QWP(0,$key_));
	&pand	($inout5,$twmask);		# isolate carry and residue
	 &movups	($inout0,&QWP(0,$inp));	# load input
	&pxor	($inout5,$tweak);

	# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
	&mov	($rounds,$rounds_);
	&movdqu	($inout1,&QWP(16*1,$inp));
	 &xorps		($inout0,$rndkey0);	# input^=rndkey[0]
	&movdqu	($inout2,&QWP(16*2,$inp));
	 &pxor		($inout1,$rndkey0);
	&movdqu	($inout3,&QWP(16*3,$inp));
	 &pxor		($inout2,$rndkey0);
	&movdqu	($inout4,&QWP(16*4,$inp));
	 &pxor		($inout3,$rndkey0);
	&movdqu	($rndkey1,&QWP(16*5,$inp));
	 &pxor		($inout4,$rndkey0);
	&lea	($inp,&DWP(16*6,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqa	(&QWP(16*$i,"esp"),$inout5);	# save last tweak
	&pxor	($inout5,$rndkey1);

	 &$movekey	($rndkey1,&QWP(16,$key_));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&pxor	($inout2,&QWP(16*2,"esp"));
	 &aesdec	($inout0,$rndkey1);
	&pxor	($inout3,&QWP(16*3,"esp"));
	&pxor	($inout4,&QWP(16*4,"esp"));
	 &aesdec	($inout1,$rndkey1);
	&pxor		($inout5,$rndkey0);
	 &$movekey	($rndkey0,&QWP(32,$key_));
	 &aesdec	($inout2,$rndkey1);
	 &aesdec	($inout3,$rndkey1);
	 &aesdec	($inout4,$rndkey1);
	 &aesdec	($inout5,$rndkey1);
	&call		(&label("_aesni_decrypt6_enter"));

	&movdqa	($tweak,&QWP(16*5,"esp"));	# last tweak
       &pxor	($twtmp,$twtmp);
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
       &pcmpgtd	($twtmp,$tweak);		# broadcast upper bits
	&xorps	($inout1,&QWP(16*1,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*2,$out),$inout2);
	&xorps	($inout4,&QWP(16*4,"esp"));
	&movups	(&QWP(16*3,$out),$inout3);
	&xorps	($inout5,$tweak);
	&movups	(&QWP(16*4,$out),$inout4);
       &pshufd	($twres,$twtmp,0x13);
	&movups	(&QWP(16*5,$out),$inout5);
	&lea	($out,&DWP(16*6,$out));
       &movdqa	($twmask,&QWP(16*6,"esp"));	# 0x0...010...87

	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);

	&sub	($len,16*6);
	&jnc	(&label("xts_dec_loop6"));

	&mov	($rounds,&DWP(240,$key_));	# restore $rounds
	&mov	($key,$key_);			# restore $key
	&mov	($rounds_,$rounds);

&set_label("xts_dec_short");
	&add	($len,16*6);
	&jz	(&label("xts_dec_done6x"));

	&movdqa	($inout3,$tweak);		# put aside previous tweak
	&cmp	($len,0x20);
	&jb	(&label("xts_dec_one"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&je	(&label("xts_dec_two"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout4,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&cmp	($len,0x40);
	&jb	(&label("xts_dec_three"));

	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($inout5,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);
	&movdqa	(&QWP(16*0,"esp"),$inout3);
	&movdqa	(&QWP(16*1,"esp"),$inout4);
	&je	(&label("xts_dec_four"));

	&movdqa	(&QWP(16*2,"esp"),$inout5);
	&pshufd	($inout5,$twtmp,0x13);
	&movdqa	(&QWP(16*3,"esp"),$tweak);
	&paddq	($tweak,$tweak);		# &psllq($inout0,1);
	&pand	($inout5,$twmask);		# isolate carry and residue
	&pxor	($inout5,$tweak);

	&movdqu	($inout0,&QWP(16*0,$inp));	# load input
	&movdqu	($inout1,&QWP(16*1,$inp));
	&movdqu	($inout2,&QWP(16*2,$inp));
	&pxor	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movdqu	($inout3,&QWP(16*3,$inp));
	&pxor	($inout1,&QWP(16*1,"esp"));
	&movdqu	($inout4,&QWP(16*4,$inp));
	&pxor	($inout2,&QWP(16*2,"esp"));
	&lea	($inp,&DWP(16*5,$inp));
	&pxor	($inout3,&QWP(16*3,"esp"));
	&movdqa	(&QWP(16*4,"esp"),$inout5);	# save last tweak
	&pxor	($inout4,$inout5);

	&call	("_aesni_decrypt6");

	&movaps	($tweak,&QWP(16*4,"esp"));	# last tweak
	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,&QWP(16*2,"esp"));
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,&QWP(16*3,"esp"));
	&movups	(&QWP(16*1,$out),$inout1);
	&xorps	($inout4,$tweak);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&movups	(&QWP(16*4,$out),$inout4);
	&lea	($out,&DWP(16*5,$out));
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_one",16);
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&lea	($inp,&DWP(16*1,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&lea	($out,&DWP(16*1,$out));

	&movdqa	($tweak,$inout3);		# last tweak
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_two",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&lea	($inp,&DWP(16*2,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);

	&call	("_aesni_decrypt2");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&lea	($out,&DWP(16*2,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_three",16);
	&movaps	($inout5,$tweak);		# put aside last tweak
	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&lea	($inp,&DWP(16*3,$inp));
	&xorps	($inout0,$inout3);		# input^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);

	&call	("_aesni_decrypt3");

	&xorps	($inout0,$inout3);		# output^=tweak
	&xorps	($inout1,$inout4);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&lea	($out,&DWP(16*3,$out));

	&movdqa	($tweak,$inout5);		# last tweak
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_four",16);
	&movaps	($inout4,$tweak);		# put aside last tweak

	&movups	($inout0,&QWP(16*0,$inp));	# load input
	&movups	($inout1,&QWP(16*1,$inp));
	&movups	($inout2,&QWP(16*2,$inp));
	&xorps	($inout0,&QWP(16*0,"esp"));	# input^=tweak
	&movups	($inout3,&QWP(16*3,$inp));
	&lea	($inp,&DWP(16*4,$inp));
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&xorps	($inout3,$inout4);

	&call	("_aesni_decrypt4");

	&xorps	($inout0,&QWP(16*0,"esp"));	# output^=tweak
	&xorps	($inout1,&QWP(16*1,"esp"));
	&xorps	($inout2,$inout5);
	&movups	(&QWP(16*0,$out),$inout0);	# write output
	&xorps	($inout3,$inout4);
	&movups	(&QWP(16*1,$out),$inout1);
	&movups	(&QWP(16*2,$out),$inout2);
	&movups	(&QWP(16*3,$out),$inout3);
	&lea	($out,&DWP(16*4,$out));

	&movdqa	($tweak,$inout4);		# last tweak
	&jmp	(&label("xts_dec_done"));

&set_label("xts_dec_done6x",16);		# $tweak is pre-calculated
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&and	($len,15);
	&jz	(&label("xts_dec_ret"));
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&jmp	(&label("xts_dec_only_one_more"));

&set_label("xts_dec_done",16);
	&mov	($len,&DWP(16*7+0,"esp"));	# restore original $len
	&pxor	($twtmp,$twtmp);
	&and	($len,15);
	&jz	(&label("xts_dec_ret"));

	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&mov	(&DWP(16*7+0,"esp"),$len);	# save $len%16
	&pshufd	($twres,$twtmp,0x13);
	&pxor	($twtmp,$twtmp);
	&movdqa	($twmask,&QWP(16*6,"esp"));
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($twres,$twmask);		# isolate carry and residue
	&pcmpgtd($twtmp,$tweak);		# broadcast upper bits
	&pxor	($tweak,$twres);

&set_label("xts_dec_only_one_more");
	&pshufd	($inout3,$twtmp,0x13);
	&movdqa	($inout4,$tweak);		# put aside previous tweak
	&paddq	($tweak,$tweak);		# &psllq($tweak,1);
	&pand	($inout3,$twmask);		# isolate carry and residue
	&pxor	($inout3,$tweak);

	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	&movups	($inout0,&QWP(0,$inp));		# load input
	&xorps	($inout0,$inout3);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout3);		# output^=tweak
	&movups	(&QWP(0,$out),$inout0);		# write output

&set_label("xts_dec_steal");
	&movz	($rounds,&BP(16,$inp));
	&movz	($key,&BP(0,$out));
	&lea	($inp,&DWP(1,$inp));
	&mov	(&BP(0,$out),&LB($rounds));
	&mov	(&BP(16,$out),&LB($key));
	&lea	($out,&DWP(1,$out));
	&sub	($len,1);
	&jnz	(&label("xts_dec_steal"));

	&sub	($out,&DWP(16*7+0,"esp"));	# rewind $out
	&mov	($key,$key_);			# restore $key
	&mov	($rounds,$rounds_);		# restore $rounds

	&movups	($inout0,&QWP(0,$out));		# load input
	&xorps	($inout0,$inout4);		# input^=tweak
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$inout4);		# output^=tweak
	&movups	(&QWP(0,$out),$inout0);		# write output

&set_label("xts_dec_ret");
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&movdqa	(&QWP(16*0,"esp"),"xmm0");	# clear stack
	&pxor	("xmm3","xmm3");
	&movdqa	(&QWP(16*1,"esp"),"xmm0");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(16*2,"esp"),"xmm0");
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(16*3,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(16*4,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&movdqa	(&QWP(16*5,"esp"),"xmm0");
	&mov	("esp",&DWP(16*7+4,"esp"));	# restore %esp
&function_end("aesni_xts_decrypt");
}
}

######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));
	&mov	($rounds_,"esp");
	&mov	($out,&wparam(1));
	&sub	($rounds_,24);
	&mov	($len,&wparam(2));
	&and	($rounds_,-16);
	&mov	($key,&wparam(3));
	&mov	($key_,&wparam(4));
	&test	($len,$len);
	&jz	(&label("cbc_abort"));

	&cmp	(&wparam(5),0);
	&xchg	($rounds_,"esp");		# alloca
	&movups	($ivec,&QWP(0,$key_));		# load IV
	&mov	($rounds,&DWP(240,$key));
	&mov	($key_,$key);			# backup $key
	&mov	(&DWP(16,"esp"),$rounds_);	# save original %esp
	&mov	($rounds_,$rounds);		# backup $rounds
	&je	(&label("cbc_decrypt"));

	&movaps	($inout0,$ivec);
	&cmp	($len,16);
	&jb	(&label("cbc_enc_tail"));
	&sub	($len,16);
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movups	($ivec,&QWP(0,$inp));		# input actually
	&lea	($inp,&DWP(16,$inp));
	if ($inline)
	{   &aesni_inline_generate1("enc",$inout0,$ivec);	}
	else
	{   &xorps($inout0,$ivec); &call("_aesni_encrypt1");	}
	&mov	($rounds,$rounds_);	# restore $rounds
	&mov	($key,$key_);		# restore $key
	&movups	(&QWP(0,$out),$inout0);	# store output
	&lea	($out,&DWP(16,$out));
	&sub	($len,16);
	&jnc	(&label("cbc_enc_loop"));
	&add	($len,16);
	&jnz	(&label("cbc_enc_tail"));
	&movaps	($ivec,$inout0);
	&pxor	($inout0,$inout0);
	&jmp	(&label("cbc_ret"));

&set_label("cbc_enc_tail");
	&mov	("ecx",$len);		# zaps $rounds
	&data_word(0xA4F3F689);		# rep movsb
	&mov	("ecx",16);		# zero tail
	&sub	("ecx",$len);
	&xor	("eax","eax");		# zaps $len
	&data_word(0xAAF3F689);		# rep stosb
	&lea	($out,&DWP(-16,$out));	# rewind $out by 1 block
	&mov	($rounds,$rounds_);	# restore $rounds
	&mov	($inp,$out);		# $inp and $out are the same
	&mov	($key,$key_);		# restore $key
	&jmp	(&label("cbc_enc_loop"));
######################################################################
&set_label("cbc_decrypt",16);
	&cmp	($len,0x50);
	&jbe	(&label("cbc_dec_tail"));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_loop6_enter"));

&set_label("cbc_dec_loop6",16);
	&movaps	(&QWP(0,"esp"),$rndkey0);	# save IV
	&movups	(&QWP(0,$out),$inout5);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
	&movdqu	($inout0,&QWP(0,$inp));
	&movdqu	($inout1,&QWP(0x10,$inp));
	&movdqu	($inout2,&QWP(0x20,$inp));
	&movdqu	($inout3,&QWP(0x30,$inp));
	&movdqu	($inout4,&QWP(0x40,$inp));
	&movdqu	($inout5,&QWP(0x50,$inp));

	&call	("_aesni_decrypt6");

	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^=IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout4,$rndkey0);
	&movups	($rndkey0,&QWP(0x50,$inp));	# IV
	&xorps	($inout5,$rndkey1);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&lea	($inp,&DWP(0x60,$inp));
	&movups	(&QWP(0x20,$out),$inout2);
	&mov	($rounds,$rounds_);		# restore $rounds
	&movups	(&QWP(0x30,$out),$inout3);
	&mov	($key,$key_);			# restore $key
	&movups	(&QWP(0x40,$out),$inout4);
	&lea	($out,&DWP(0x50,$out));
	&sub	($len,0x60);
	&ja	(&label("cbc_dec_loop6"));

	&movaps	($inout0,$inout5);
	&movaps	($ivec,$rndkey0);
	&add	($len,0x50);
	&jle	(&label("cbc_dec_clear_tail_collected"));
	&movups	(&QWP(0,$out),$inout0);
	&lea	($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
	&movups	($inout0,&QWP(0,$inp));
	&movaps	($in0,$inout0);
	&cmp	($len,0x10);
	&jbe	(&label("cbc_dec_one"));

	&movups	($inout1,&QWP(0x10,$inp));
	&movaps	($in1,$inout1);
	&cmp	($len,0x20);
	&jbe	(&label("cbc_dec_two"));

	&movups	($inout2,&QWP(0x20,$inp));
	&cmp	($len,0x30);
	&jbe	(&label("cbc_dec_three"));

	&movups	($inout3,&QWP(0x30,$inp));
	&cmp	($len,0x40);
	&jbe	(&label("cbc_dec_four"));

	&movups	($inout4,&QWP(0x40,$inp));
	&movaps	(&QWP(0,"esp"),$ivec);		# save IV
	&movups	($inout0,&QWP(0,$inp));
	&xorps	($inout5,$inout5);
	&call	("_aesni_decrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,&QWP(0,"esp"));	# ^= IV
	&xorps	($inout1,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout2,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout3,$rndkey1);
	&movups	($ivec,&QWP(0x40,$inp));	# IV
	&xorps	($inout4,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&pxor	($inout2,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&pxor	($inout3,$inout3);
	&lea	($out,&DWP(0x40,$out));
	&movaps	($inout0,$inout4);
	&pxor	($inout4,$inout4);
	&sub	($len,0x50);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one",16);
	if ($inline)
	{   &aesni_inline_generate1("dec");	}
	else
	{   &call	("_aesni_decrypt1");	}
	&xorps	($inout0,$ivec);
	&movaps	($ivec,$in0);
	&sub	($len,0x10);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two",16);
	&call	("_aesni_decrypt2");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout1);
	&pxor	($inout1,$inout1);
	&lea	($out,&DWP(0x10,$out));
	&movaps	($ivec,$in1);
	&sub	($len,0x20);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three",16);
	&call	("_aesni_decrypt3");
	&xorps	($inout0,$ivec);
	&xorps	($inout1,$in0);
	&xorps	($inout2,$in1);
	&movups	(&QWP(0,$out),$inout0);
	&movaps	($inout0,$inout2);
	&pxor	($inout2,$inout2);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&lea	($out,&DWP(0x20,$out));
	&movups	($ivec,&QWP(0x20,$inp));
	&sub	($len,0x30);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_four",16);
	&call	("_aesni_decrypt4");
	&movups	($rndkey1,&QWP(0x10,$inp));
	&movups	($rndkey0,&QWP(0x20,$inp));
	&xorps	($inout0,$ivec);
	&movups	($ivec,&QWP(0x30,$inp));
	&xorps	($inout1,$in0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&pxor	($inout1,$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&pxor	($inout2,$inout2);
	&lea	($out,&DWP(0x30,$out));
	&movaps	($inout0,$inout3);
	&pxor	($inout3,$inout3);
	&sub	($len,0x40);
	&jmp	(&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_clear_tail_collected",16);
	&pxor	($inout1,$inout1);
	&pxor	($inout2,$inout2);
	&pxor	($inout3,$inout3);
	&pxor	($inout4,$inout4);
&set_label("cbc_dec_tail_collected");
	&and	($len,15);
	&jnz	(&label("cbc_dec_tail_partial"));
	&movups	(&QWP(0,$out),$inout0);
	&pxor	($rndkey0,$rndkey0);
	&jmp	(&label("cbc_ret"));

&set_label("cbc_dec_tail_partial",16);
	&movaps	(&QWP(0,"esp"),$inout0);
	&pxor	($rndkey0,$rndkey0);
	&mov	("ecx",16);
	&mov	($inp,"esp");
	&sub	("ecx",$len);
	&data_word(0xA4F3F689);		# rep movsb
	&movdqa	(&QWP(0,"esp"),$inout0);

&set_label("cbc_ret");
	&mov	("esp",&DWP(16,"esp"));	# pull original %esp
	&mov	($key_,&wparam(4));
	&pxor	($inout0,$inout0);
	&pxor	($rndkey1,$rndkey1);
	&movups	(&QWP(0,$key_),$ivec);	# output IV
	&pxor	($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");

######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code
#	$round	rounds

&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	&lea	($key,&DWP(16,$key));
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&cmp		("ebp",1<<28);
	&je		(&label("10rounds_alt"));

	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(80,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("10rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&mov		($rounds,8);
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&movdqa		("xmm2","xmm0");
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key128");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(16,$key));

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm0");
	&movdqa		("xmm2","xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key128"));

	&movdqa		("xmm4",&QWP(0x30,"ebx"));

	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&movdqa		("xmm2","xmm0");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(16,$key),"xmm0");

	&mov		($rounds,9);
	&mov		(&DWP(96,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&cmp		("ebp",1<<28);
	&je		(&label("12rounds_alt"));

	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(48,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

&set_label("12rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x10,"ebx"));
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,8);
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key192");
	&movq		(&QWP(0,$key),"xmm2");
	&movdqa		("xmm1","xmm2");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");
	&pslld		("xmm4",1);
	&lea		($key,&DWP(24,$key));

	&movdqa		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm0","xmm3");

	&pshufd		("xmm3","xmm0",0xff);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");

	&pxor		("xmm0","xmm2");
	&pxor		("xmm2","xmm3");
	&movdqu		(&QWP(-16,$key),"xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key192"));

	&mov	($rounds,11);
	&mov	(&DWP(32,$key),$rounds);

	&jmp	(&label("good_key"));

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea		($key,&DWP(16,$key));
	&cmp		("ebp",1<<28);
	&je		(&label("14rounds_alt"));

	&mov		($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);
	&xor		("eax","eax");

	&jmp	(&label("good_key"));

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

&set_label("14rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));
	&movdqa		("xmm4",&QWP(0x20,"ebx"));
	&mov		($rounds,7);
	&movdqu		(&QWP(-32,$key),"xmm0");
	&movdqa		("xmm1","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm2");

&set_label("loop_key256");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	&movdqa		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm0","xmm3");
	&pslld		("xmm4",1);

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&dec		($rounds);
	&jz		(&label("done_key256"));

	&pshufd		("xmm2","xmm0",0xff);
	&pxor		("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");

	&movdqa		("xmm3","xmm1")
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm1","xmm3");

	&pxor		("xmm2","xmm1");
	&movdqu		(&QWP(16,$key),"xmm2");
	&lea		($key,&DWP(32,$key));
	&movdqa		("xmm1","xmm2");
	&jmp		(&label("loop_key256"));

&set_label("done_key256");
	&mov		($rounds,13);
	&mov		(&DWP(16,$key),$rounds);

&set_label("good_key");
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&xor	("eax","eax");
	&pop	("ebx");
	&pop	("ebp");
	&ret	();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");
	&mov	("eax",-2);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&function_end_B("_aesni_set_encrypt_key");

# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");

# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));
	&mov	($rounds,&wparam(1));
	&mov	($key,&wparam(2));
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&pxor		("xmm0","xmm0");
	&pxor		("xmm1","xmm1");
	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");

&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();