aes/asm/bsaes-x86_64.pl: Atom-specific optimization.

Author: Andy Polyakov
Date:   2014-04-24 10:13:30 +02:00
Commit: 558ff0f0c1 (parent 94d1f4b0f3)

aes/asm/bsaes-x86_64.pl

@@ -38,8 +38,8 @@
 #		Emilia's	this(*)		difference
 #
 # Core 2	9.30		8.69		+7%
-# Nehalem(**)	7.63		6.98		+9%
-# Atom		17.1		17.4		-2%(***)
+# Nehalem(**)	7.63		6.88		+11%
+# Atom		17.1		16.4		+4%
 #
 # (*)	Comparison is not completely fair, because "this" is ECB,
 #	i.e. no extra processing such as counter values calculation
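
The difference column is, by the look of it, simply the ratio of the two cycles-per-byte figures rounded to a whole percent. A standalone Perl sketch (not part of the module; figures copied from the table above) that reproduces the updated entries:

#!/usr/bin/env perl
# Recompute the "difference" column as (Emilia's cpb / this cpb - 1), in percent.
use strict; use warnings;
my %cpb = (                       # CPU => [ Emilia's, this ] cycles per byte
    "Core 2"      => [ 9.30, 8.69 ],
    "Nehalem(**)" => [ 7.63, 6.88 ],
    "Atom"        => [ 17.1, 16.4 ],
);
for my $cpu (sort keys %cpb) {
    my ($ref, $this) = @{ $cpb{$cpu} };
    printf "%-12s %+.0f%%\n", $cpu, 100 * ($ref / $this - 1);
}

It prints +7%, +11% and +4% for Core 2, Nehalem(**) and Atom respectively, matching the new right-hand column.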
@@ -50,14 +50,6 @@
 # (**)	Results were collected on Westmere, which is considered to
 #	be equivalent to Nehalem for this code.
 #
-# (***)	Slowdown on Atom is rather strange per se, because original
-#	implementation has a number of 9+-bytes instructions, which
-#	are bad for Atom front-end, and which I eliminated completely.
-#	In attempt to address deterioration sbox() was tested in FP
-#	SIMD "domain" (movaps instead of movdqa, xorps instead of
-#	pxor, etc.). While it resulted in nominal 4% improvement on
-#	Atom, it hurted Westmere by more than 2x factor.
-#
 # As for key schedule conversion subroutine. Interface to OpenSSL
 # relies on per-invocation on-the-fly conversion. This naturally
 # has impact on performance, especially for short inputs. Conversion
@@ -67,7 +59,7 @@
 #			conversion	conversion/8x block
 # Core 2		240		0.22
 # Nehalem		180		0.20
-# Atom			430		0.19
+# Atom			430		0.20
 #
 # The ratio values mean that 128-byte blocks will be processed
 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
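
The quoted percentages correspond, near enough, to the per-key conversion cost taken as a fraction of the total time for one, two or three 8x (128-byte) chunks, i.e. ratio/(n + ratio). A throwaway Perl sketch under that reading, with the ratios taken from the right-hand column above:

#!/usr/bin/env perl
# Conversion overhead amortized over n 8x (128-byte) chunks, read as
# ratio / (n + ratio), with ratio = the "conversion/8x block" column above.
use strict; use warnings;
my %ratio = ("Core 2" => 0.22, "Nehalem" => 0.20, "Atom" => 0.20);
for my $n (1 .. 3) {              # 128-, 256- and 384-byte inputs
    printf "%3d bytes:", 128 * $n;
    for my $cpu (sort keys %ratio) {
        printf "  %-8s %4.1f%%", $cpu, 100 * $ratio{$cpu} / ($n + $ratio{$cpu});
    }
    print "\n";
}

This gives 16.7-18.0%, 9.1-9.9% and 6.3-6.8%, i.e. the quoted 16-18%, 9-10% and 6-7% give or take rounding.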
@@ -83,9 +75,9 @@
 # Add decryption procedure. Performance in CPU cycles spent to decrypt
 # one byte out of 4096-byte buffer with 128-bit key is:
 #
-# Core 2	9.83
-# Nehalem	7.74
-# Atom		19.0
+# Core 2	9.98
+# Nehalem	7.80
+# Atom		17.9
 #
 # November 2011.
 #
@@ -434,21 +426,21 @@ my $mask=pop;
 $code.=<<___;
 	pxor	0x00($key),@x[0]
 	pxor	0x10($key),@x[1]
-	 pshufb	$mask,@x[0]
 	pxor	0x20($key),@x[2]
-	 pshufb	$mask,@x[1]
 	pxor	0x30($key),@x[3]
-	 pshufb	$mask,@x[2]
+	 pshufb	$mask,@x[0]
+	 pshufb	$mask,@x[1]
 	pxor	0x40($key),@x[4]
-	 pshufb	$mask,@x[3]
 	pxor	0x50($key),@x[5]
-	 pshufb	$mask,@x[4]
+	 pshufb	$mask,@x[2]
+	 pshufb	$mask,@x[3]
 	pxor	0x60($key),@x[6]
-	 pshufb	$mask,@x[5]
 	pxor	0x70($key),@x[7]
+	 pshufb	$mask,@x[4]
+	 pshufb	$mask,@x[5]
 	 pshufb	$mask,@x[6]
-	lea	0x80($key),$key
 	 pshufb	$mask,@x[7]
+	lea	0x80($key),$key
 ___
 }
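
All four code hunks in this commit make the same change: the round-key pxor instructions are now emitted in pairs, each pair of dependent pshufb byte-shuffles is deferred until after the next pair of XORs, and, where present, the trailing lea is pushed past the last shuffle. The commit message only calls this an Atom-specific optimization; presumably the grouping schedules better on Atom's in-order, two-wide pipeline. A throwaway Perl sketch, purely illustrative, that prints the post-patch instruction sequence of the hunk above (modulo whitespace):

#!/usr/bin/env perl
# Hypothetical generator (the module itself writes these instructions out
# literally in here-documents, as the hunk shows): reproduce the post-patch
# schedule, where round-key XORs go out in pairs and each pair of byte
# shuffles is deferred until after the next pair of XORs.
use strict; use warnings;
my @pxor   = map { sprintf "\tpxor\t0x%02x(\$key),\@x[%d]", 0x10 * $_, $_ } 0 .. 7;
my @pshufb = map { sprintf "\t pshufb\t\$mask,\@x[%d]", $_ } 0 .. 7;

my @sched;
for (my $i = 0; $i < 8; $i += 2) {
    push @sched, @pxor[$i, $i + 1];                       # two independent XORs
    push @sched, @pshufb[$i - 2, $i - 1] if $i >= 2;      # shuffles lag one pair behind
}
push @sched, @pshufb[6, 7], "\tlea\t0x80(\$key),\$key";   # drain, then bump key pointer
print "$_\n" for @sched;

The same shape, with @XMM[9] and @XMM[8] in place of the key pointer and mask, appears in _bsaes_encrypt8, _bsaes_decrypt8 and the CTR path below.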
@@ -820,18 +812,18 @@ _bsaes_encrypt8:
 	movdqa	0x50($const), @XMM[8]	# .LM0SR
 	pxor	@XMM[9], @XMM[0]	# xor with round0 key
 	pxor	@XMM[9], @XMM[1]
-	 pshufb	@XMM[8], @XMM[0]
 	pxor	@XMM[9], @XMM[2]
-	 pshufb	@XMM[8], @XMM[1]
 	pxor	@XMM[9], @XMM[3]
-	 pshufb	@XMM[8], @XMM[2]
+	 pshufb	@XMM[8], @XMM[0]
+	 pshufb	@XMM[8], @XMM[1]
 	pxor	@XMM[9], @XMM[4]
-	 pshufb	@XMM[8], @XMM[3]
 	pxor	@XMM[9], @XMM[5]
-	 pshufb	@XMM[8], @XMM[4]
+	 pshufb	@XMM[8], @XMM[2]
+	 pshufb	@XMM[8], @XMM[3]
 	pxor	@XMM[9], @XMM[6]
-	 pshufb	@XMM[8], @XMM[5]
 	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[4]
+	 pshufb	@XMM[8], @XMM[5]
 	 pshufb	@XMM[8], @XMM[6]
 	 pshufb	@XMM[8], @XMM[7]
 _bsaes_encrypt8_bitslice:
@@ -884,18 +876,18 @@ _bsaes_decrypt8:
 	movdqa	-0x30($const), @XMM[8]	# .LM0ISR
 	pxor	@XMM[9], @XMM[0]	# xor with round0 key
 	pxor	@XMM[9], @XMM[1]
-	 pshufb	@XMM[8], @XMM[0]
 	pxor	@XMM[9], @XMM[2]
-	 pshufb	@XMM[8], @XMM[1]
 	pxor	@XMM[9], @XMM[3]
-	 pshufb	@XMM[8], @XMM[2]
+	 pshufb	@XMM[8], @XMM[0]
+	 pshufb	@XMM[8], @XMM[1]
 	pxor	@XMM[9], @XMM[4]
-	 pshufb	@XMM[8], @XMM[3]
 	pxor	@XMM[9], @XMM[5]
-	 pshufb	@XMM[8], @XMM[4]
+	 pshufb	@XMM[8], @XMM[2]
+	 pshufb	@XMM[8], @XMM[3]
 	pxor	@XMM[9], @XMM[6]
-	 pshufb	@XMM[8], @XMM[5]
 	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[4]
+	 pshufb	@XMM[8], @XMM[5]
 	 pshufb	@XMM[8], @XMM[6]
 	 pshufb	@XMM[8], @XMM[7]
 ___
@@ -1937,21 +1929,21 @@ $code.=<<___;
 	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
 	pxor	@XMM[9], @XMM[0]	# xor with round0 key
 	pxor	@XMM[9], @XMM[1]
-	 pshufb	@XMM[8], @XMM[0]
 	pxor	@XMM[9], @XMM[2]
-	 pshufb	@XMM[8], @XMM[1]
 	pxor	@XMM[9], @XMM[3]
-	 pshufb	@XMM[8], @XMM[2]
+	 pshufb	@XMM[8], @XMM[0]
+	 pshufb	@XMM[8], @XMM[1]
 	pxor	@XMM[9], @XMM[4]
-	 pshufb	@XMM[8], @XMM[3]
 	pxor	@XMM[9], @XMM[5]
-	 pshufb	@XMM[8], @XMM[4]
+	 pshufb	@XMM[8], @XMM[2]
+	 pshufb	@XMM[8], @XMM[3]
 	pxor	@XMM[9], @XMM[6]
-	 pshufb	@XMM[8], @XMM[5]
 	pxor	@XMM[9], @XMM[7]
+	 pshufb	@XMM[8], @XMM[4]
+	 pshufb	@XMM[8], @XMM[5]
 	 pshufb	@XMM[8], @XMM[6]
-	lea	.LBS0(%rip), %r11	# constants table
 	 pshufb	@XMM[8], @XMM[7]
+	lea	.LBS0(%rip), %r11	# constants table
 	mov	%ebx,%r10d		# pass rounds
 	call	_bsaes_encrypt8_bitslice