From 6c83629bd9e14be3e76e3ee0da2d3a811a190487 Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Sat, 10 Apr 2010 13:56:59 +0000
Subject: [PATCH] AESNI engine: add counter mode.

---
 crypto/aes/asm/aesni-x86.pl    | 211 ++++++++++++++++++---
 crypto/aes/asm/aesni-x86_64.pl | 324 ++++++++++++++++++++++++++++-----
 crypto/engine/eng_aesni.c      | 178 +++++++++++++++++-
 3 files changed, 642 insertions(+), 71 deletions(-)

diff --git a/crypto/aes/asm/aesni-x86.pl b/crypto/aes/asm/aesni-x86.pl
index 72faa78d1..8c1426cd5 100644
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@@ -23,7 +23,8 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],$0);
 
-$movekey = eval($RREFIX eq "aseni" ? "*movaps" : "*movups");
+if ($PREFIX eq "aesni")	{ $movekey=*movaps; }
+else			{ $movekey=*movups; }
 
 $len="eax";
 $rounds="ecx";
@@ -41,7 +42,7 @@ $rndkey1="xmm4";
 $ivec="xmm5";
 $in0="xmm6";
 $in1="xmm7";	$inout3="xmm7";
-
+
 # Inline version of internal aesni_[en|de]crypt1
 sub aesni_inline_generate1
 { my $p=shift;
@@ -104,7 +105,7 @@ sub aesni_generate1	# fully unrolled loop
     &ret();
     &function_end_B("_aesni_${p}rypt1");
 }
-
+
 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
 &aesni_generate1("enc") if (!$inline);
 &function_begin_B("${PREFIX}_encrypt");
@@ -136,7 +137,7 @@ sub aesni_generate1	# fully unrolled loop
 	&movups	(&QWP(0,"eax"),$inout0);
 	&ret	();
 &function_end_B("${PREFIX}_decrypt");
-
+
 # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
 # factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
 # latency is 6, it turned out that it can be scheduled only every
@@ -229,8 +230,9 @@ sub aesni_generate4
 &aesni_generate3("dec");
 &aesni_generate4("enc") if ($PREFIX eq "aesni");
 &aesni_generate4("dec");
-
+
 if ($PREFIX eq "aesni") {
+######################################################################
 # void aesni_ecb_encrypt (const void *in, void *out,
 #                         size_t length, const AES_KEY *key,
 #                         int enc);
@@ -249,8 +251,9 @@ if ($PREFIX eq "aesni") {
 	&mov	($rounds_,$rounds);	# backup $rounds
 	&jz	(&label("ecb_decrypt"));
 
-	&sub	($len,0x40);
+	&cmp	($len,0x40);
 	&jbe	(&label("ecb_enc_tail"));
+	&sub	($len,0x40);
 	&jmp	(&label("ecb_enc_loop3"));
 
 &set_label("ecb_enc_loop3",16);
@@ -268,14 +271,13 @@ if ($PREFIX eq "aesni") {
 	&movups	(&QWP(-0x10,$out),$inout2);
 	&ja	(&label("ecb_enc_loop3"));
 
-&set_label("ecb_enc_tail");
 	&add	($len,0x40);
 	&jz	(&label("ecb_ret"));
 
-	&cmp	($len,0x10);
-	&movups	($inout0,&QWP(0,$inp));
-	&je	(&label("ecb_enc_one"));
+&set_label("ecb_enc_tail");
 	&cmp	($len,0x20);
+	&movups	($inout0,&QWP(0,$inp));
+	&jb	(&label("ecb_enc_one"));
 	&movups	($inout1,&QWP(0x10,$inp));
 	&je	(&label("ecb_enc_two"));
 	&cmp	($len,0x30);
@@ -309,10 +311,11 @@ if ($PREFIX eq "aesni") {
 	&movups	(&QWP(0x10,$out),$inout1);
 	&movups	(&QWP(0x20,$out),$inout2);
 	&jmp	(&label("ecb_ret"));
-
+######################################################################
 &set_label("ecb_decrypt",16);
-	&sub	($len,0x40);
+	&cmp	($len,0x40);
 	&jbe	(&label("ecb_dec_tail"));
+	&sub	($len,0x40);
 	&jmp	(&label("ecb_dec_loop3"));
 
 &set_label("ecb_dec_loop3",16);
@@ -330,14 +333,13 @@ if ($PREFIX eq "aesni") {
 	&movups	(&QWP(-0x10,$out),$inout2);
 	&ja	(&label("ecb_dec_loop3"));
 
-&set_label("ecb_dec_tail");
 	&add	($len,0x40);
 	&jz	(&label("ecb_ret"));
 
-	&cmp	($len,0x10);
-	&movups	($inout0,&QWP(0,$inp));
-	&je	(&label("ecb_dec_one"));
+&set_label("ecb_dec_tail");
 	&cmp	($len,0x20);
+	&movups	($inout0,&QWP(0,$inp));
+	&jb	(&label("ecb_dec_one"));
 	&movups	($inout1,&QWP(0x10,$inp));
 	&je	(&label("ecb_dec_two"));
 	&cmp	($len,0x30);
@@ -373,8 +375,173 @@ if ($PREFIX eq "aesni") {
 
 &set_label("ecb_ret");
 &function_end("aesni_ecb_encrypt");
-}
+
+######################################################################
+# handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+&function_begin("aesni_ctr32_encrypt_blocks");
+	&mov	($inp,&wparam(0));
+	&mov	($out,&wparam(1));
+	&mov	($len,&wparam(2));
+	&mov	($key,&wparam(3));
+	&mov	($rounds_,&wparam(4));
+	&mov	($key_,"esp");
+	&sub	("esp",60);
+	&and	("esp",-16);			# align stack
+	&mov	(&DWP(48,"esp"),$key_);
 
+	&movups	($inout3,&QWP(0,$rounds_));	# load ivec
+
+	# compose byte-swap control mask for pshufb on stack
+	&mov	(&DWP(0,"esp"),0x0c0d0e0f);
+	&mov	(&DWP(4,"esp"),0x08090a0b);
+	&mov	(&DWP(8,"esp"),0x04050607);
+	&mov	(&DWP(12,"esp"),0x00010203);
+
+	# compose counter increment vector on stack
+	&mov	($rounds,3);
+	&xor	($key_,$key_);
+	&mov	(&DWP(16,"esp"),$rounds);
+	&mov	(&DWP(20,"esp"),$rounds);
+	&mov	(&DWP(24,"esp"),$rounds);
+	&mov	(&DWP(28,"esp"),$key_);
+
+	&pextrd	($rounds_,$inout3,3);		# pull 32-bit counter
+	&pinsrd	($inout3,$key_,3);		# wipe 32-bit counter
+
+	&mov	($rounds,&DWP(240,$key));	# key->rounds
+	&movaps	($rndkey0,&QWP(0,"esp"));	# load byte-swap mask
+
+	# $ivec is vector of 3 32-bit counters
+	&pxor	($ivec,$ivec);
+	&bswap	($rounds_);
+	&pinsrd	($ivec,$rounds_,0);
+	&inc	($rounds_);
+	&pinsrd	($ivec,$rounds_,1);
+	&inc	($rounds_);
+	&pinsrd	($ivec,$rounds_,2);
+
+	&cmp	($len,4);
+	&pshufb	($ivec,$rndkey0);		# byte swap
+	&jbe	(&label("ctr32_tail"));
+	&movaps	(&QWP(32,"esp"),$inout3);	# save counter-less ivec
+	&mov	($rounds_,$rounds);
+	&mov	($key_,$key);
+	&sub	($len,4);
+	&jmp	(&label("ctr32_loop3"));
+
+&set_label("ctr32_loop3",16);
+	&pshufd	($inout0,$ivec,3<<6);		# place counter to upper dword
+	&pshufd	($inout1,$ivec,2<<6);
+	&pshufd	($inout2,$ivec,1<<6);
+	&por	($inout0,$inout3);		# merge counter-less ivec
+	&por	($inout1,$inout3);
+	&por	($inout2,$inout3);
+
+	&call	("_aesni_encrypt3");
+
+	 &movaps($rndkey0,&QWP(0,"esp"));	# load byte-swap mask
+	&movups	($in0,&QWP(0,$inp));
+	&movups	($in1,&QWP(0x10,$inp));
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	 &pshufb($ivec,$rndkey0);		# byte swap
+	 &paddd	($ivec,&QWP(16,"esp"));		# counter increment
+	&pxor	($in0,$inout0);
+	&pxor	($in1,$inout1);
+	&pxor	($rndkey1,$inout2);
+	&movups	(&QWP(0,$out),$in0);
+	&movups	(&QWP(0x10,$out),$in1);
+	&movups	(&QWP(0x20,$out),$rndkey1);
+	&movaps	($inout3,&QWP(32,"esp"));	# load counter-less ivec
+	 &pshufb($ivec,$rndkey0);		# byte swap
+
+	&sub	($len,3);
+	&lea	($inp,&DWP(0x30,$inp));
+	&lea	($out,&DWP(0x30,$out));
+	&mov	($key,$key_);
+	&mov	($rounds,$rounds_);
+	&ja	(&label("ctr32_loop3"));
+
+	&add	($len,4);
+	&pextrd	($rounds_,$ivec,1);		# might need last counter value
+	&jz	(&label("ctr32_ret"));
+	&bswap	($rounds_);
+
+&set_label("ctr32_tail");
+	&cmp	($len,2);
+	&pshufd	($inout0,$ivec,3<<6);
+	&pshufd	($inout1,$ivec,2<<6);
+	&pshufd	($inout2,$ivec,1<<6);
+	&por	($inout0,$inout3);
+	&jb	(&label("ctr32_one"));
+	&por	($inout1,$inout3);
+	&je	(&label("ctr32_two"));
+	&cmp	($len,3);
+	&por	($inout2,$inout3);
+	&je	(&label("ctr32_three"));
+
+	&inc	($rounds_);			# compose last counter value
+	&bswap	($rounds_);
+	&pinsrd	($inout3,$rounds_,3);
+
+	&call	("_aesni_encrypt4");
+
+	&movups	($in0,&QWP(0,$inp));
+	&movups	($rndkey1,&QWP(0x10,$inp));
+	&movups	($rndkey0,&QWP(0x20,$inp));
+	&movups	($ivec,&QWP(0x30,$inp));
+	&pxor	($in0,$inout0);
+	&pxor	($rndkey1,$inout1);
+	&pxor	($rndkey0,$inout2);
+	&pxor	($ivec,$inout3);
+	&movups	(&QWP(0,$out),$in0);
+	&movups	(&QWP(0x10,$out),$rndkey1);
+	&movups	(&QWP(0x20,$out),$rndkey0);
+	&movups	(&QWP(0x30,$out),$ivec);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_one",16);
+	if ($inline)
+	{   &aesni_inline_generate1("enc");	}
+	else
+	{   &call	("_aesni_encrypt1");	}
+	&movups	($in0,&QWP(0,$inp));
+	&pxor	($in0,$inout0);
+	&movups	(&QWP(0,$out),$in0);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_two",16);
+	&call	("_aesni_encrypt3");
+	&movups	($in0,&QWP(0,$inp));
+	&movups	($in1,&QWP(0x10,$inp));
+	&pxor	($in0,$inout0);
+	&pxor	($in1,$inout1);
+	&movups	(&QWP(0,$out),$in0);
+	&movups	(&QWP(0x10,$out),$in1);
+	&jmp	(&label("ctr32_ret"));
+
+&set_label("ctr32_three",16);
+	&call	("_aesni_encrypt3");
+	&movups	($in0,&QWP(0,$inp));
+	&movups	($in1,&QWP(0x10,$inp));
+	&movups	($rndkey1,&QWP(0x20,$inp));
+	&pxor	($in0,$inout0);
+	&pxor	($in1,$inout1);
+	&pxor	($rndkey1,$inout2);
+	&movups	(&QWP(0,$out),$in0);
+	&movups	(&QWP(0x10,$out),$in1);
+	&movups	(&QWP(0x20,$out),$rndkey1);
+
+&set_label("ctr32_ret");
+	&mov	("esp",&DWP(48,"esp"));
+&function_end("aesni_ctr32_encrypt_blocks");
+}
+
+######################################################################
 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
 #                           size_t length, const AES_KEY *key,
 #                           unsigned char *ivp,const int enc);
@@ -431,10 +598,11 @@ if ($PREFIX eq "aesni") {
 	&mov	($inp,$out);		# $inp and $out are the same
 	&mov	($key,$key_);		# restore $key
 	&jmp	(&label("cbc_enc_loop"));
-
+######################################################################
 &set_label("cbc_decrypt",16);
-	&sub	($len,0x40);
+	&cmp	($len,0x40);
 	&jbe	(&label("cbc_dec_tail"));
+	&sub	($len,0x40);
 	&jmp	(&label("cbc_dec_loop3"));
 
 &set_label("cbc_dec_loop3",16);
@@ -458,10 +626,10 @@ if ($PREFIX eq "aesni") {
 	&movups	(&QWP(-0x10,$out),$inout2);
 	&ja	(&label("cbc_dec_loop3"));
 
-&set_label("cbc_dec_tail");
 	&add	($len,0x40);
 	&jz	(&label("cbc_ret"));
 
+&set_label("cbc_dec_tail");
 	&movups	($inout0,&QWP(0,$inp));
 	&cmp	($len,0x10);
 	&movaps	($in0,$inout0);
@@ -539,7 +707,8 @@ if ($PREFIX eq "aesni") {
 	&mov	($key_,&wparam(4));
 	&movups	(&QWP(0,$key_),$ivec);	# output IV
 &function_end("${PREFIX}_cbc_encrypt");
-
+
+######################################################################
 # Mechanical port from aesni-x86_64.pl.
 #
 # _aesni_set_encrypt_key is private interface,
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index cdc076e24..d8697519e 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -41,7 +41,7 @@ $inp="%rdi";
 $out="%rsi";
 $len="%rdx";
 $key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
-$ivp="%r8";	# cbc
+$ivp="%r8";	# cbc, ctr
 
 $rnds_="%r10d";	# backup copy for $rounds
 $key_="%r11";	# backup copy for $key
@@ -51,7 +51,7 @@ $inout0="%xmm0";	$inout1="%xmm1";
 $inout2="%xmm2";	$inout3="%xmm3";
 $rndkey0="%xmm4";	$rndkey1="%xmm5";
 
-$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt
+$iv="%xmm6";		$in0="%xmm7";	# used in CBC decrypt, CTR
 $in1="%xmm8";		$in2="%xmm9";
 
 # Inline version of internal aesni_[en|de]crypt1.
@@ -214,6 +214,7 @@ ___
 &aesni_generate4("dec");
 
 if ($PREFIX eq "aesni") {
+########################################################################
 # void aesni_ecb_encrypt (const void *in, void *out,
 #			  size_t length, const AES_KEY *key,
 #			  int enc);
@@ -232,8 +233,9 @@ aesni_ecb_encrypt:
 	mov	$rounds,$rnds_		# backup $rounds
 	jz	.Lecb_decrypt
 #--------------------------- ECB ENCRYPT ------------------------------#
-	sub	\$0x40,$len
+	cmp	\$0x40,$len
 	jbe	.Lecb_enc_tail
+	sub	\$0x40,$len
 	jmp	.Lecb_enc_loop3
 .align 16
 .Lecb_enc_loop3:
@@ -251,14 +253,13 @@ aesni_ecb_encrypt:
 	movups	$inout2,-0x10($out)
 	ja	.Lecb_enc_loop3
 
-.Lecb_enc_tail:
 	add	\$0x40,$len
 	jz	.Lecb_ret
 
-	cmp	\$0x10,$len
-	movups	($inp),$inout0
-	je	.Lecb_enc_one
+.Lecb_enc_tail:
 	cmp	\$0x20,$len
+	movups	($inp),$inout0
+	jb	.Lecb_enc_one
 	movups	0x10($inp),$inout1
 	je	.Lecb_enc_two
 	cmp	\$0x30,$len
@@ -294,8 +295,9 @@ $code.=<<___;
 #--------------------------- ECB DECRYPT ------------------------------#
 .align	16
 .Lecb_decrypt:
-	sub	\$0x40,$len
+	cmp	\$0x40,$len
 	jbe	.Lecb_dec_tail
+	sub	\$0x40,$len
 	jmp	.Lecb_dec_loop3
 .align 16
 .Lecb_dec_loop3:
@@ -313,14 +315,13 @@ $code.=<<___;
 	movups	$inout2,-0x10($out)
 	ja	.Lecb_dec_loop3
 
-.Lecb_dec_tail:
 	add	\$0x40,$len
 	jz	.Lecb_ret
 
-	cmp	\$0x10,$len
-	movups	($inp),$inout0
-	je	.Lecb_dec_one
+.Lecb_dec_tail:
 	cmp	\$0x20,$len
+	movups	($inp),$inout0
+	jb	.Lecb_dec_one
 	movups	0x10($inp),$inout1
 	je	.Lecb_dec_two
 	cmp	\$0x30,$len
@@ -357,8 +358,175 @@ $code.=<<___;
 	ret
 .size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
 ___
+######################################################################
+# handles only complete blocks, operates on 32-bit counter and
+# does not update *ivec! (see engine/eng_aesni.c for details)
+#
+# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+#                         size_t blocks, const AES_KEY *key,
+#                         const char *ivec);
+$increment="%xmm10";
+$bswap_mask="%xmm11";
+
+$code.=<<___;
+.globl	aesni_ctr32_encrypt_blocks
+.type	aesni_ctr32_encrypt_blocks,\@function,5
+.align	16
+aesni_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($win64);
+	lea	-0x68(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+	movaps	%xmm8,0x20(%rsp)
+	movaps	%xmm9,0x30(%rsp)
+	movaps	%xmm10,0x40(%rsp)
+	movaps	%xmm11,0x50(%rsp)
+
+.Lctr32_body:
+___
+$code.=<<___;
+	movups	($ivp),$inout3
+	movaps	.Lincrement(%rip),$increment
+	movaps	.Lbswap_mask(%rip),$bswap_mask
+	xor	$rounds,$rounds
+	pextrd	\$3,$inout3,$rnds_		# pull 32-bit counter
+	pinsrd	\$3,$rounds,$inout3		# wipe 32-bit counter
+
+	mov	240($key),$rounds		# key->rounds
+	pxor	$iv,$iv				# vector of 3 32-bit counters
+	bswap	$rnds_
+	pinsrd	\$0,$rnds_,$iv
+	inc	$rnds_
+	pinsrd	\$1,$rnds_,$iv
+	inc	$rnds_
+	pinsrd	\$2,$rnds_,$iv
+
+	cmp	\$4,$len
+	pshufb	$bswap_mask,$iv
+	jbe	.Lctr32_tail
+	mov	$rounds,$rnds_
+	mov	$key,$key_
+	sub	\$4,$len
+	jmp	.Lctr32_loop3
+
+.align	16
+.Lctr32_loop3:
+	pshufd	\$`3<<6`,$iv,$inout0		# place counter to upper dword
+	pshufd	\$`2<<6`,$iv,$inout1
+	pshufd	\$`1<<6`,$iv,$inout2
+	movups	($inp),$in0
+	movups	0x10($inp),$in1
+	movups	0x20($inp),$in2
+	por	$inout3,$inout0			# merge counter-less ivec
+	por	$inout3,$inout1
+	por	$inout3,$inout2
+	pshufb	$bswap_mask,$iv
+
+	call	_aesni_encrypt3
+
+	paddd	$increment,$iv
+	pxor	$inout0,$in0
+	pxor	$inout1,$in1
+	pxor	$inout2,$in2
+	pshufb	$bswap_mask,$iv
+	movups	$in0,($out)
+	movups	$in1,0x10($out)
+	movups	$in2,0x20($out)
+
+	sub	\$3,$len
+	lea	0x30($inp),$inp
+	lea	0x30($out),$out
+	mov	$key_,$key
+	mov	$rnds_,$rounds
+	ja	.Lctr32_loop3
+
+	add	\$4,$len
+	pextrd	\$1,$iv,$rnds_			# might need last counter value
+	jz	.Lctr32_done
+	bswap	$rnds_
+
+.Lctr32_tail:
+	cmp	\$2,$len
+	pshufd	\$`3<<6`,$iv,$inout0
+	pshufd	\$`2<<6`,$iv,$inout1
+	pshufd	\$`1<<6`,$iv,$inout2
+	por	$inout3,$inout0
+	movups	($inp),$in0
+	jb	.Lctr32_one
+	por	$inout3,$inout1
+	movups	0x10($inp),$in1
+	je	.Lctr32_two
+	cmp	\$3,$len
+	por	$inout3,$inout2
+	movups	0x20($inp),$in2
+	je	.Lctr32_three
+
+	inc	$rnds_				# compose last counter value
+	bswap	$rnds_
+	pinsrd	\$3,$rnds_,$inout3
+	movups	0x30($inp),$iv
+
+	call	_aesni_encrypt4
+
+	pxor	$inout0,$in0
+	pxor	$inout1,$in1
+	pxor	$inout2,$in2
+	pxor	$inout3,$iv
+	movups	$in0,($out)
+	movups	$in1,0x10($out)
+	movups	$in2,0x20($out)
+	movups	$iv,0x30($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_one:
+___
+	&aesni_generate1("enc",$key,$rounds);
+$code.=<<___;
+	pxor	$inout0,$in0
+	movups	$in0,($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_two:
+	call	_aesni_encrypt3
+	pxor	$inout0,$in0
+	pxor	$inout1,$in1
+	movups	$in0,($out)
+	movups	$in1,0x10($out)
+	jmp	.Lctr32_done
+
+.align	16
+.Lctr32_three:
+	call	_aesni_encrypt3
+	pxor	$inout0,$in0
+	pxor	$inout1,$in1
+	pxor	$inout2,$in2
+	movups	$in0,($out)
+	movups	$in1,0x10($out)
+	movups	$in2,0x20($out)
+
+.Lctr32_done:
+___
+
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	lea	0x68(%rsp),%rsp
+___
+$code.=<<___;
+.Lctr32_ret:
+	ret
+.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+___
 }
 
+########################################################################
 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
 #			    size_t length, const AES_KEY *key,
 #			    unsigned char *ivp,const int enc);
@@ -429,9 +597,10 @@ $code.=<<___ if ($win64);
 ___
 $code.=<<___;
 	movups	($ivp),$iv
-	sub	\$0x40,$len
+	cmp	\$0x40,$len
 	mov	$rnds_,$rounds
 	jbe	.Lcbc_dec_tail
+	sub	\$0x40,$len
 	jmp	.Lcbc_dec_loop3
 .align 16
 .Lcbc_dec_loop3:
@@ -456,11 +625,11 @@ $code.=<<___;
 	movups	$inout2,-0x10($out)
 	ja	.Lcbc_dec_loop3
 
-.Lcbc_dec_tail:
 	add	\$0x40,$len
 	movups	$iv,($ivp)
 	jz	.Lcbc_dec_ret
 
+.Lcbc_dec_tail:
 	movups	($inp),$inout0
 	cmp	\$0x10,$len
 	movaps	$inout0,$in0
@@ -796,6 +965,11 @@ ___
 }
 
 $code.=<<___;
+.align	64
+.Lbswap_mask:
+	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lincrement:
+	.long	3,3,3,0
 .asciz  "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
 .align	64
 ___
@@ -810,6 +984,75 @@ $disp="%r9";
 
 $code.=<<___;
 .extern	__imp_RtlVirtualUnwind
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+.type	ecb_se_handler,\@abi-omnipotent
+.align	16
+ecb_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	152($context),%rax	# pull context->Rsp
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	jmp	.Lcommon_seh_exit
+.size	ecb_se_handler,.-ecb_se_handler
+
+.type	ctr32_se_handler,\@abi-omnipotent
+.align	16
+ctr32_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lctr32_body(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<"prologue" label
+	jb	.Lin_ctr32_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lctr32_ret(%rip),%r10
+	cmp	%r10,%rbx
+	jae	.Lin_ctr32_prologue
+
+	lea	0(%rax),%rsi		# top of stack
+	lea	512($context),%rdi	# &context.Xmm6
+	mov	\$12,%ecx		# 6*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+	lea	0x68(%rax),%rax		# adjust stack pointer
+
+.Lin_ctr32_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	jmp	.Lcommon_seh_exit
+.size	ctr32_se_handler,.-ctr32_se_handler
+___
+$code.=<<___;
 .type	cbc_se_handler,\@abi-omnipotent
 .align	16
 cbc_se_handler:
@@ -829,55 +1072,32 @@ cbc_se_handler:
 
 	lea	.Lcbc_decrypt(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<"prologue" label
-	jb	.Lin_prologue
+	jb	.Lin_cbc_prologue
 
 	lea	.Lcbc_decrypt_body(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip<cbc_decrypt_body
-	jb	.Lrestore_rax
+	jb	.Lrestore_cbc_rax
 
 	lea	.Lcbc_ret(%rip),%r10
 	cmp	%r10,%rbx		# context->Rip>="epilogue" label
-	jae	.Lin_prologue
+	jae	.Lin_cbc_prologue
 
 	lea	0(%rax),%rsi		# top of stack
 	lea	512($context),%rdi	# &context.Xmm6
 	mov	\$8,%ecx		# 4*sizeof(%xmm0)/sizeof(%rax)
 	.long	0xa548f3fc		# cld; rep movsq
 	lea	0x58(%rax),%rax		# adjust stack pointer
-	jmp	.Lin_prologue
+	jmp	.Lin_cbc_prologue
 
-.Lrestore_rax:
+.Lrestore_cbc_rax:
 	mov	120($context),%rax
-.Lin_prologue:
+.Lin_cbc_prologue:
 	mov	8(%rax),%rdi
 	mov	16(%rax),%rsi
 	mov	%rax,152($context)	# restore context->Rsp
 	mov	%rsi,168($context)	# restore context->Rsi
 	mov	%rdi,176($context)	# restore context->Rdi
 
-	jmp	.Lcommon_seh_exit
-.size	cbc_se_handler,.-cbc_se_handler
-
-.type	ecb_se_handler,\@abi-omnipotent
-.align	16
-ecb_se_handler:
-	push	%rsi
-	push	%rdi
-	push	%rbx
-	push	%rbp
-	push	%r12
-	push	%r13
-	push	%r14
-	push	%r15
-	pushfq
-	sub	\$64,%rsp
-
-	mov	152($context),%rax	# pull context->Rsp
-	mov	8(%rax),%rdi
-	mov	16(%rax),%rsi
-	mov	%rsi,168($context)	# restore context->Rsi
-	mov	%rdi,176($context)	# restore context->Rdi
-
 .Lcommon_seh_exit:
 
 	mov	40($disp),%rdi		# disp->ContextRecord
@@ -915,10 +1135,17 @@ ecb_se_handler:
 
 .section	.pdata
 .align	4
-	.rva	.LSEH_begin_${PREFIX}_ecb_encrypt
-	.rva	.LSEH_end_${PREFIX}_ecb_encrypt
+___
+$code.=<<___ if ($PREFIX eq "aesni");
+	.rva	.LSEH_begin_aesni_ecb_encrypt
+	.rva	.LSEH_end_aesni_ecb_encrypt
 	.rva	.LSEH_info_ecb
 
+	.rva	.LSEH_begin_aesni_ctr32_encrypt_blocks
+	.rva	.LSEH_end_aesni_ctr32_encrypt_blocks
+	.rva	.LSEH_info_ctr32
+___
+$code.=<<___;
 	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
 	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
 	.rva	.LSEH_info_cbc
@@ -932,9 +1159,16 @@ ecb_se_handler:
 	.rva	.LSEH_info_key
 .section	.xdata
 .align	8
+___
+$code.=<<___ if ($PREFIX eq "aesni");
 .LSEH_info_ecb:
 	.byte	9,0,0,0
 	.rva	ecb_se_handler
+.LSEH_info_ctr32:
+	.byte	9,0,0,0
+	.rva	ctr32_se_handler
+___
+$code.=<<___;
 .LSEH_info_cbc:
 	.byte	9,0,0,0
 	.rva	cbc_se_handler
diff --git a/crypto/engine/eng_aesni.c b/crypto/engine/eng_aesni.c
index 2a997cae3..70b2838b4 100644
--- a/crypto/engine/eng_aesni.c
+++ b/crypto/engine/eng_aesni.c
@@ -111,6 +111,35 @@ void ENGINE_load_aesni (void)
 }
 
 #ifdef COMPILE_HW_AESNI
+
+typedef unsigned int u32;
+typedef unsigned char u8;
+
+#if defined(__GNUC__) && __GNUC__>=2
+#  define BSWAP4(x) ({	u32 ret=(x);			\
+			asm volatile ("bswapl %0"	\
+			: "+r"(ret));	ret;		})
+#elif defined(_MSC_VER)
+# if _MSC_VER>=1300
+#  pragma intrinsic(_byteswap_ulong)
+#  define BSWAP4(x)	_byteswap_ulong((u32)(x))
+# elif defined(_M_IX86)
+   __inline u32 _bswap4(u32 val) {
+	_asm mov eax,val
+	_asm bswap eax
+   }
+#  define BSWAP4(x)	_bswap4(x)
+# endif
+#endif
+
+#ifdef BSWAP4
+#define GETU32(p)	BSWAP4(*(const u32 *)(p))
+#define PUTU32(p,v)	*(u32 *)(p) = BSWAP4(v)
+#else
+#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
+#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
+#endif
+
 int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
 			      AES_KEY *key);
 int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
@@ -132,6 +161,12 @@ void aesni_cbc_encrypt(const unsigned char *in,
 			   const AES_KEY *key,
 			   unsigned char *ivec, int enc);
 
+void aesni_ctr32_encrypt_blocks(const unsigned char *in,
+			   unsigned char *out,
+			   size_t blocks,
+			   const AES_KEY *key,
+			   const unsigned char *ivec);
+
 /* Function for ENGINE detection and control */
 static int aesni_init(ENGINE *e);
 
@@ -224,16 +259,19 @@ static int aesni_cipher_nids[] = {
 	NID_aes_128_cbc,
 	NID_aes_128_cfb,
 	NID_aes_128_ofb,
+	NID_aes_128_ctr,
 
 	NID_aes_192_ecb,
 	NID_aes_192_cbc,
 	NID_aes_192_cfb,
 	NID_aes_192_ofb,
+	NID_aes_192_ctr,
 
 	NID_aes_256_ecb,
 	NID_aes_256_cbc,
 	NID_aes_256_cfb,
 	NID_aes_256_ofb,
+	NID_aes_256_ctr,
 };
 static int aesni_cipher_nids_num =
 	(sizeof(aesni_cipher_nids)/sizeof(aesni_cipher_nids[0]));
@@ -251,18 +289,28 @@ aesni_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *user_key,
 	int ret;
 	AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);
 
-	if ((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CFB_MODE
-	    || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_OFB_MODE
-	    || enc)
-		ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key);
-	else
+	if (((ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_ECB_MODE
+	    || (ctx->cipher->flags & EVP_CIPH_MODE) == EVP_CIPH_CBC_MODE)
+	    && !enc)
 		ret=aesni_set_decrypt_key(user_key, ctx->key_len * 8, key);
+	else
+		ret=aesni_set_encrypt_key(user_key, ctx->key_len * 8, key);
 
 	if(ret < 0) {
 		EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_KEY_SETUP_FAILED);
 		return 0;
 	}
 
+	if (ctx->cipher->flags&EVP_CIPH_CUSTOM_IV)
+		{
+		if (iv!=NULL)
+			memcpy (ctx->iv,iv,ctx->cipher->iv_len);
+		else	{
+			EVPerr(EVP_F_AESNI_INIT_KEY,EVP_R_AES_IV_SETUP_FAILED);
+			return 0;
+			}
+		}
+
 	return 1;
 }
 
@@ -336,6 +384,117 @@ DECLARE_AES_EVP(256,cbc,CBC);
 DECLARE_AES_EVP(256,cfb,CFB);
 DECLARE_AES_EVP(256,ofb,OFB);
 
+static void ctr96_inc(unsigned char *counter) {
+	u32 n=12;
+	u8  c;
+
+	do {
+		--n;
+		c = counter[n];
+		++c;
+		counter[n] = c;
+		if (c) return;
+	} while (n);
+}
+
+static int aesni_counter(EVP_CIPHER_CTX *ctx, unsigned char *out,
+		const unsigned char *in, size_t len)
+{
+	AES_KEY *key = AESNI_ALIGN(ctx->cipher_data);
+	u32 n, ctr32;
+	n = ctx->num;
+
+	while (n && len) {
+		*(out++) = *(in++) ^ ctx->buf[n];
+		--len;
+		n = (n+1) % 16;
+	}
+
+	ctr32 = GETU32(ctx->iv+12);
+	while (len>=16) {
+		size_t blocks = len/16;
+		/*
+		 * 1<<24 is just a not-so-small yet not-so-large number...
+		 */
+		if (blocks > (1U<<24)) blocks = (1U<<24);
+		/*
+		 * As aesni_ctr32 operates on 32-bit counter, caller
+		 * has to handle overflow. 'if' below detects the
+		 * overflow, which is then handled by limiting the
+		 * amount of blocks to the exact overflow point...
+		 */
+		ctr32 += (u32)blocks;
+		if (ctr32 < blocks) {
+			blocks -= ctr32;
+			ctr32   = 0;
+		}
+		aesni_ctr32_encrypt_blocks(in,out,blocks,key,ctx->iv);
+		/* aesni_ctr32 does not update ctx->iv, caller does: */
+		PUTU32(ctx->iv+12,ctr32);
+		/* ... overflow was detected, propagate carry. */
+		if (ctr32 == 0)	ctr96_inc(ctx->iv);
+		blocks *= 16;
+		len -= blocks;
+		out += blocks;
+		in  += blocks;
+	}
+	if (len) {
+		aesni_encrypt(ctx->iv,ctx->buf,key);
+		++ctr32;
+		PUTU32(ctx->iv+12,ctr32);
+		if (ctr32 == 0)	ctr96_inc(ctx->iv);
+		while (len--) {
+			out[n] = in[n] ^ ctx->buf[n];
+			++n;
+		}
+	}
+	ctx->num = n;
+
+	return 1;
+}
+
+static const EVP_CIPHER aesni_128_ctr=
+	{
+	NID_aes_128_ctr,1,16,16,
+	EVP_CIPH_CUSTOM_IV,
+	aesni_init_key,
+	aesni_counter,
+	NULL,
+	sizeof(AESNI_KEY),
+	NULL,
+	NULL,
+	NULL,
+	NULL
+	};
+
+static const EVP_CIPHER aesni_192_ctr=
+	{
+	NID_aes_192_ctr,1,24,16,
+	EVP_CIPH_CUSTOM_IV,
+	aesni_init_key,
+	aesni_counter,
+	NULL,
+	sizeof(AESNI_KEY),
+	NULL,
+	NULL,
+	NULL,
+	NULL
+	};
+
+static const EVP_CIPHER aesni_256_ctr=
+	{
+	NID_aes_256_ctr,1,32,16,
+	EVP_CIPH_CUSTOM_IV,
+	aesni_init_key,
+	aesni_counter,
+	NULL,
+	sizeof(AESNI_KEY),
+	NULL,
+	NULL,
+	NULL,
+	NULL
+	};
+
 static int
 aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
 		      const int **nids, int nid)
@@ -360,6 +519,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
 	case NID_aes_128_ofb:
 		*cipher = &aesni_128_ofb;
 		break;
+	case NID_aes_128_ctr:
+		*cipher = &aesni_128_ctr;
+		break;
 
 	case NID_aes_192_ecb:
 		*cipher = &aesni_192_ecb;
@@ -373,6 +535,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
 	case NID_aes_192_ofb:
 		*cipher = &aesni_192_ofb;
 		break;
+	case NID_aes_192_ctr:
+		*cipher = &aesni_192_ctr;
+		break;
 
 	case NID_aes_256_ecb:
 		*cipher = &aesni_256_ecb;
@@ -386,6 +551,9 @@ aesni_ciphers (ENGINE *e, const EVP_CIPHER **cipher,
 	case NID_aes_256_ofb:
 		*cipher = &aesni_256_ofb;
 		break;
+	case NID_aes_256_ctr:
+		*cipher = &aesni_256_ctr;
+		break;
 
 	default:
 		/* Sorry, we don't support this NID */