aes/asm/aesni-x86_64.pl: further optimization for Atom Silvermont.

Improve CBC decrypt and CTR by ~13/16%, which adds up to ~25/33% improvement over "pre-Silvermont" version. [Add performance table to aesni-x86.pl].
2014-02-14 17:06:15 +01:00 · 2014-02-14 17:06:15 +01:00 · 5599c7331b
commit 5599c7331b
parent 385b348666
2 changed files with 179 additions and 21 deletions
--- a/crypto/aes/asm/aesni-x86.pl
+++ b/crypto/aes/asm/aesni-x86.pl
@ -43,6 +43,17 @@
 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
 ######################################################################
 # Current large-block performance in cycles per byte processed with
 # 128-bit key (less is better).
 #
 #		CBC en-/decrypt	CTR	XTS	ECB
 # Westmere	3.77/1.37	1.37	1.52	1.27
 # * Bridge	5.07/0.98	0.99	1.09	0.91
 # Haswell	4.44/0.80	0.97	1.03	0.72
 # Atom		5.77/3.56	3.67	4.03	3.46
 # Bulldozer	5.80/0.98	1.05	1.24	0.93
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
 			# crypto/aes/asm/aes-586.pl:-)
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@ -158,25 +158,19 @@
 # in CTR mode AES instruction interleave factor was chosen to be 6x.
 ######################################################################
-# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
+# Current large-block performance in cycles per byte processed with
-# with 128-bit key in CBC encrypt and 0.70 cycles in CBC decrypt, 0.70
+# 128-bit key (less is better).
-# in ECB, 0.71 in CTR, 0.90 in XTS... This means that aes[enc|dec]
+#
-# instruction latency is 9 cycles and that they can be issued every
+#		CBC en-/decrypt	CTR	XTS	ECB
-# cycle.
+# Westmere	3.77/1.25	1.25	1.25	1.26
-
+# * Bridge	5.07/0.74	0.75	0.90	0.85
-######################################################################
+# Haswell	4.44/0.63	0.63	0.73	0.63
-# Haswell spends 4.44 cycles per byte in CBC encrypt, 0.63 in CBC
+# Atom		5.75/3.54	3.56	4.12	3.87(*)
-# decrypt, CTR and ECB, 0.73 in XTS.
+# Bulldozer	5.77/0.70	0.72	0.90	0.70
-
+#
-######################################################################
+# (*)	Atom ECB result is suboptimal because of penalties incurred
-# Atom Silvermont spends 5.77/4.0 cycles per byte in CBC en-/decrypt,
+#	by operations on %xmm8-15. As ECB is not considered
-# 3.87 in ECB, 4.15 in CTR, 4.12 in XTS. Results for parallelizeable
+#	critical, nothing was done to mitigate the problem.
 # modes [other than XTS] are actually suboptimal, because of penalties
 # incurred by operations on %xmm8-15, which are inevitable with such
 # high instruction interleave factors. This means that performance can
 # be improved by decreasing the interleave factor, but then it would
 # negatively affect other platforms in relatively larger degree.
 # Run-time detection would solve the dilemma...
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
@ -201,6 +195,7 @@ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
 		("%rdi","%rsi","%rdx","%rcx");	# Unix order
 $code=".text\n";
 $code.=".extern	OPENSSL_ia32cap_P\n";
 $rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
@ -1119,7 +1114,9 @@ $code.=<<___;
 	lea	7($ctr),%r9
 	 mov	%r10d,0x60+12(%rsp)
 	bswap	%r9d
 	 mov	OPENSSL_ia32cap_P+4(%rip),%r10d 
 	xor	$key0,%r9d
 	 and	\$`1<<26|1<<22`,%r10d		# isolate XSAVE+MOVBE
 	mov	%r9d,0x70+12(%rsp)
 	$movkey	0x10($key),$rndkey1
@ -1130,10 +1127,104 @@ $code.=<<___;
 	cmp	\$8,$len
 	jb	.Lctr32_tail
 	sub	\$6,$len
 	cmp	\$`1<<22`,%r10d		# check for MOVBE without XSAVE
 	je	.Lctr32_6x
 	lea	0x80($key),$key		# size optimization
-	sub	\$8,$len
+	sub	\$2,$len
 	jmp	.Lctr32_loop8
 .align	16
 .Lctr32_6x:
 	# Set-up for the 6x-interleaved CTR loop. Reached from the dispatch
 	# above when the capability word says "MOVBE without XSAVE".
 	# $rounds is scaled to a byte count (16 bytes per round key), $key is
 	# moved to the end of the key schedule, and %r10 ends up holding
 	# 48-16*rounds, so that N($key,$rnds_) addresses round keys relative to
 	# the start of the schedule (the existing comment below equates %r10
 	# with $rnds_ and %rax with $rounds).
 	shl	\$4,$rounds
 	mov	\$48,$rnds_
 	# $key0 is byte-swapped once here because the loop XORs it into the
 	# counter first and lets MOVBE do the byte swap on store.
 	bswap	$key0
 	lea	32($key,$rounds),$key	# end of key schedule
 	sub	%rax,%r10		# twisted $rounds
 	jmp	.Lctr32_loop6
 .align	16
 .Lctr32_loop6:
 	# One pass handles six 16-byte blocks.
 	# The extra-indented integer instructions prepare the NEXT six counter
 	# blocks while the first two AES rounds run on the current ones:
 	# counter+i (i=0..5) is XORed with $key0 and stored byte-swapped by
 	# MOVBE into the last dword (offset +12) of stack slot 0x10*i.
 	# The order of the two instruction streams is deliberate scheduling;
 	# do not reorder.
 	 add	\$6,$ctr
 	$movkey	-48($key,$rnds_),$rndkey0
 	aesenc	$rndkey1,$inout0
 	 mov	$ctr,%eax
 	 xor	$key0,%eax
 	aesenc	$rndkey1,$inout1
 	 movbe	%eax,`0x00+12`(%rsp)
 	 lea	1($ctr),%eax
 	aesenc	$rndkey1,$inout2
 	 xor	$key0,%eax
 	 movbe	%eax,`0x10+12`(%rsp)
 	aesenc	$rndkey1,$inout3
 	 lea	2($ctr),%eax
 	 xor	$key0,%eax
 	aesenc	$rndkey1,$inout4
 	 movbe	%eax,`0x20+12`(%rsp)
 	 lea	3($ctr),%eax
 	aesenc	$rndkey1,$inout5
 	$movkey	-32($key,$rnds_),$rndkey1
 	 xor	$key0,%eax
 	aesenc	$rndkey0,$inout0
 	 movbe	%eax,`0x30+12`(%rsp)
 	 lea	4($ctr),%eax
 	aesenc	$rndkey0,$inout1
 	 xor	$key0,%eax
 	 movbe	%eax,`0x40+12`(%rsp)
 	aesenc	$rndkey0,$inout2
 	 lea	5($ctr),%eax
 	 xor	$key0,%eax
 	aesenc	$rndkey0,$inout3
 	 movbe	%eax,`0x50+12`(%rsp)
 	# %eax was used as counter scratch above; reload the twisted round
 	# count into it before calling the shared round loop.
 	 mov	%r10,%rax		# mov	$rnds_,$rounds
 	aesenc	$rndkey0,$inout4
 	aesenc	$rndkey0,$inout5
 	$movkey	-16($key,$rnds_),$rndkey0
 	# .Lenc_loop6 is defined outside this excerpt; presumably it runs the
 	# remaining rounds on all six blocks - confirm against its definition.
 	call	.Lenc_loop6
 	# Load six input blocks and advance the input pointer by 0x60.
 	movdqu	($inp),$inout6
 	movdqu	0x10($inp),$inout7
 	movdqu	0x20($inp),$in0
 	movdqu	0x30($inp),$in1
 	movdqu	0x40($inp),$in2
 	movdqu	0x50($inp),$in3
 	lea	0x60($inp),$inp
 	# Reload the round key consumed by the first round of the next pass.
 	$movkey	-64($key,$rnds_),$rndkey1
 	# XOR each input block with the matching result block, then refill the
 	# result register from the stack slot written by MOVBE above, i.e. the
 	# counter block for the next pass.
 	pxor	$inout0,$inout6
 	movaps	0x00(%rsp),$inout0
 	pxor	$inout1,$inout7
 	movaps	0x10(%rsp),$inout1
 	pxor	$inout2,$in0
 	movaps	0x20(%rsp),$inout2
 	pxor	$inout3,$in1
 	movaps	0x30(%rsp),$inout3
 	pxor	$inout4,$in2
 	movaps	0x40(%rsp),$inout4
 	pxor	$inout5,$in3
 	movaps	0x50(%rsp),$inout5
 	# Store six output blocks and advance the output pointer by 0x60.
 	movdqu	$inout6,($out)
 	movdqu	$inout7,0x10($out)
 	movdqu	$in0,0x20($out)
 	movdqu	$in1,0x30($out)
 	movdqu	$in2,0x40($out)
 	movdqu	$in3,0x50($out)
 	lea	0x60($out),$out
 	# $len was pre-decremented by 6 before the dispatch, so "no borrow"
 	# here means at least six more blocks remain.
 	sub	\$6,$len
 	jnc	.Lctr32_loop6
 	# Undo the bias; zero means every block has been processed.
 	add	\$6,$len
 	jz	.Lctr32_done
 	# Fewer than six blocks left: undo the set-up transformations
 	# (rounds = -(rnds_-48)/16, key = start of schedule) and hand over to
 	# the generic tail code.
 	lea	-48($rnds_),$rounds
 	lea	-80($key,$rnds_),$key	# restore $key
 	neg	$rounds
 	shr	\$4,$rounds		# restore $rounds
 	jmp	.Lctr32_tail
 .align	32
 .Lctr32_loop8:
 	 add		\$8,$ctr
@ -2455,10 +2546,15 @@ $code.=<<___;
 	movdqa	$inout3,$in3
 	movdqu	0x50($inp),$inout5
 	movdqa	$inout4,$in4
 	mov	OPENSSL_ia32cap_P+4(%rip),%r9d
 	cmp	\$0x70,$len
 	jbe	.Lcbc_dec_six_or_seven
-	sub	\$0x70,$len
+	and	\$`1<<26|1<<22`,%r9d	# isolate XSAVE+MOVBE	
 	sub	\$0x50,$len
 	cmp	\$`1<<22`,%r9d		# check for MOVBE without XSAVE
 	je	.Lcbc_dec_loop6_enter
 	sub	\$0x20,$len
 	lea	0x70($key),$key		# size optimization
 	jmp	.Lcbc_dec_loop8_enter
 .align	16
@ -2638,6 +2734,51 @@ $code.=<<___;
 	movdqa	$inout6,$inout0
 	jmp	.Lcbc_dec_tail_collected
 .align	16
 .Lcbc_dec_loop6:
 	# 6x-interleaved CBC decrypt loop. Each pass decrypts six blocks but
 	# stores only five; the sixth stays in $inout5 and is written either
 	# here, at the top of the next pass, or by the exit code below.
 	movups	$inout5,($out)
 	lea	0x10($out),$out
 	# Load six ciphertext blocks; keep copies of the first five, which
 	# serve as the chaining values for blocks 1..5.
 	movdqu	0x00($inp),$inout0	# load input
 	movdqu	0x10($inp),$inout1
 	movdqa	$inout0,$in0
 	movdqu	0x20($inp),$inout2
 	movdqa	$inout1,$in1
 	movdqu	0x30($inp),$inout3
 	movdqa	$inout2,$in2
 	movdqu	0x40($inp),$inout4
 	movdqa	$inout3,$in3
 	movdqu	0x50($inp),$inout5
 	movdqa	$inout4,$in4
 .Lcbc_dec_loop6_enter:
 	# First-pass entry point: the dispatch code jumps here when the
 	# capability word says "MOVBE without XSAVE". The initial six blocks
 	# appear to be loaded by the code preceding the dispatch (only the end
 	# of that load is visible in this excerpt - confirm).
 	lea	0x60($inp),$inp
 	# Keep the sixth ciphertext block; it becomes the IV after the call.
 	movdqa	$inout5,$inout6
 	call	_aesni_decrypt6
 	pxor	$iv,$inout0		# ^= IV
 	movdqa	$inout6,$iv
 	pxor	$in0,$inout1
 	movdqu	$inout0,($out)
 	pxor	$in1,$inout2
 	movdqu	$inout1,0x10($out)
 	pxor	$in2,$inout3
 	movdqu	$inout2,0x20($out)
 	pxor	$in3,$inout4
 	# Reload the key pointer and round count from their saved copies for
 	# the next call; the note near the top of the file says $rounds is
 	# changed by the N-block helpers.
 	mov	$key_,$key
 	movdqu	$inout3,0x30($out)
 	pxor	$in4,$inout5
 	mov	$rnds_,$rounds
 	movdqu	$inout4,0x40($out)
 	lea	0x50($out),$out
 	# $len carries a -0x50 bias (subtracted by the dispatch code before
 	# entry), so the loop repeats only while more than 0x50 bytes remain.
 	sub	\$0x60,$len
 	ja	.Lcbc_dec_loop6
 	# Exit: the pending sixth block moves to $inout0. After the bias is
 	# removed, a non-positive $len means nothing else is left and the
 	# collected block is handed to .Lcbc_dec_tail_collected.
 	movdqa	$inout5,$inout0
 	add	\$0x50,$len
 	jle	.Lcbc_dec_tail_collected
 	# Otherwise store the pending block and fall through into the generic
 	# tail code for the remaining blocks.
 	movups	$inout5,($out)
 	lea	0x10($out),$out
 .Lcbc_dec_tail:
 	movups	($inp),$inout0
 	sub	\$0x10,$len
@ -3360,8 +3501,14 @@ sub aesni {
    return $line;
 }
 # movbe(DISP8) - hand-assemble the store "movbe %eax,DISP8(%rsp)" as a .byte
 # directive, so the module builds with assemblers that lack the mnemonic.
 #
 # Encoding: 0x0f,0x38,0xf1 = MOVBE m32,r32 opcode;
 #           0x44 = ModR/M (mod=01: disp8 follows, reg=%eax, r/m=SIB);
 #           0x24 = SIB (no index, base=%rsp);
 #           DISP8 = the single displacement byte.
 #
 # Argument: non-negative decimal displacement, as captured by the
 #           substitution that rewrites "movbe %eax,N(%rsp)".
 # Returns:  the ".byte ..." directive as a string.
 # Dies:     if the displacement cannot be represented as disp8. mod=01
 #           sign-extends the byte, so a value of 128 or more would silently
 #           assemble as a negative offset from %rsp.
 sub movbe {
 	my $disp8 = shift;
 	die "movbe: displacement '$disp8' does not fit the disp8 encoding\n"
 		unless defined($disp8) && $disp8 =~ /\A[0-9]+\z/ && $disp8 <= 127;
 	".byte	0x0f,0x38,0xf1,0x44,0x24,".$disp8;
 }
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
 #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
 $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
 print $code;