x86[_64] assembly pack: update benchmark results.

2012-06-12 14:18:21 +00:00 · 2012-06-12 14:18:21 +00:00 · d2e1803197
commit d2e1803197
parent 4b9e0b5f74
10 changed files with 48 additions and 8 deletions
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@ -23,14 +23,20 @@
 #		AES-128-CBC	+SHA1		stitch      gain
 # Westmere	3.77[+5.6]	9.37		6.65	    +41%
 # Sandy Bridge	5.05[+5.2(6.3)]	10.25(11.35)	6.16(7.08)  +67%(+60%)
+# Ivy Bridge	5.05[+4.7]	9.75		5.59        +74%
+# Bulldozer	5.77[+6.1]	11.87		6.47        +83%
 #
 #		AES-192-CBC
 # Westmere	4.51		10.11		6.97	    +45%
 # Sandy Bridge	6.05		11.25(12.35)	6.34(7.27)  +77%(+70%)
+# Ivy Bridge	6.05		10.75		6.07        +77%
+# Bulldozer	6.89		12.99		7.02        +85%
 #
 #		AES-256-CBC
 # Westmere	5.25		10.85		7.25	    +50%
 # Sandy Bridge	7.05		12.25(13.35)	7.06(7.70)  +74%(+73%)
+# Ivy Bridge	7.05		11.75		7.12        +65%
+# Bulldozer	8.00		14.10		8.24        +71%
 #
 # (*)	There are two code paths: SSSE3 and AVX. See sha1-568.pl for
 #	background information. Above numbers in parentheses are SSSE3
@ -47,6 +53,8 @@
 #		AES-128-CBC	AES-192-CBC	AES-256-CBC
 # Westmere	1.31		1.55		1.80
 # Sandy Bridge	0.93		1.06		1.22
+# Ivy Bridge	0.92		1.06		1.21
+# Bulldozer	0.76		0.90		1.04

 $flavour = shift;
 $output  = shift;
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@ -157,6 +157,13 @@
 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
 # in CTR mode AES instruction interleave factor was chosen to be 6x.

+######################################################################
+# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
+# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
+# in ECB, 0.94 in CTR, 0.95 in XTS... This means that the aes[enc|dec]
+# instructions have a latency of 9 cycles and that one can be issued
+# every cycle.
+
 $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
 			# generates drop-in replacement for
 			# crypto/aes/asm/aes-x86_64.pl:-)
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@ -26,6 +26,8 @@
 # P4		125/125		17.8		84(***)
 # Opteron	66 /70		10.1		30
 # Core2		54 /67		8.4		18
+# Atom		105/105		16.8		53
+# VIA Nano	69 /71		13.0		27
 #
 # (*)	gcc 3.4.x was observed to generate few percent slower code,
 #	which is one of reasons why 2.95.3 results were chosen,
@ -113,6 +115,10 @@
 # similar manner resulted in almost 20% degradation on Sandy Bridge,
 # where original 64-bit code processes one byte in 1.95 cycles.

+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@ -22,6 +22,8 @@
 # P4		28.6		14.0		+100%
 # Opteron	19.3		7.7		+150%
 # Core2		17.8		8.1(**)		+120%
+# Atom		31.6		16.8		+88%
+# VIA Nano	21.8		10.1		+115%
 #
 # (*)	comparison is not completely fair, because C results are
 #	for vanilla "256B" implementation, while assembler results
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@ -43,6 +43,9 @@
 # Westmere	5.1/+94%(**)
 # Sandy Bridge	5.0/+8%
 # Atom		12.6/+6%
+# VIA Nano	6.4/+9%
+# Ivy Bridge	4.9/±0%
+# Bulldozer	4.9/+15%
 #
 # (*)	PIII can actually deliver 6.6 cycles per byte with MMX code,
 #	but this specific code performs poorly on Core2. And vice
--- a/crypto/rc4/asm/rc4-md5-x86_64.pl
+++ b/crypto/rc4/asm/rc4-md5-x86_64.pl
@ -30,6 +30,9 @@
 # Westmere	4.3	5.2	9.5	7.0	+36%
 # Sandy Bridge	4.2	5.5	9.7	6.8	+43%
 # Atom		9.3	6.5	15.8	11.1	+42%
+# VIA Nano	6.3	5.4	11.7	8.6	+37%
+# Ivy Bridge	4.1	5.2	9.3	6.0	+54%
+# Bulldozer	4.5	5.4	9.9	7.7	+29%
 #
 # (*)	rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
 #	is +53%...
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@ -92,6 +92,9 @@
 # Westmere	4.2/+60%
 # Sandy Bridge	4.2/+120%
 # Atom		9.3/+80%
+# VIA Nano	6.4/+4%
+# Ivy Bridge	4.1/+30%
+# Bulldozer	4.5/+30%(*)
 #
 # (*)	But corresponding loop has less instructions, which should have
 #	positive effect on upcoming Bulldozer, which has one less ALU.
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@ -92,6 +92,9 @@
 # Atom		12.5		9.5(*)/+32%	-
 # Westmere	7.3		5.6/+30%	-
 # Sandy Bridge	8.8		6.2/+40%	5.1(**)/+70%
+# Ivy Bridge	7.2		4.9/+47%	4.8(**)/+50%
+# Bulldozer	11.6		6.2/+88%
+# VIA Nano	10.6		7.5/+41%
 #
 # (*)	Loop is 1056 instructions long and expected result is ~8.25.
 #	It remains mystery [to me] why ILP is limited to 1.7.
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@ -60,6 +60,9 @@
 # Atom		11.0		9.7/+13%	-
 # Westmere	7.1		5.6/+27%	-
 # Sandy Bridge	7.9		6.3/+25%	5.2/+51%
+# Ivy Bridge	6.4		4.8/+33%	4.7/+36%
+# Bulldozer	10.9		6.1/+79%
+# VIA Nano	10.2		7.4/+38%

 $flavour = shift;
 $output  = shift;
--- a/crypto/sha/asm/sha512-586.pl
+++ b/crypto/sha/asm/sha512-586.pl
@ -11,15 +11,16 @@
 #
 # Performance in clock cycles per processed byte (less is better):
 #
-#		Pentium	PIII	P4	AMD K8	Core2
-# gcc		100	75	116	54	66
-# icc		97	77	95	55	57
-# x86 asm	61	56	82	36	40
-# SSE2 asm	-	-	38	24	20
-# x86_64 asm(*)	-	-	30	10.0	10.5
+#		PIII	P4	AMD K8	Core2	SB	Atom	Bldzr
+# gcc		75	116	54	66	58	126	121
+# icc		77	95	55	57	-	-	-
+# x86 asm	56	82	36	40	35	68	50
+# SSE2 asm	-	38	24	20	16	64(**)	18
+# x86_64 asm(*)	-	33	9.6	10.3	11.3	14.7	13.5
 #
-# (*) x86_64 assembler performance is presented for reference
-#     purposes.
+# (*)	x86_64 assembler performance is presented for reference
+#	purposes.
+# (**)	paddq is incredibly slow on Atom.
 #
 # IALU code-path is optimized for elder Pentiums. On vanilla Pentium
 # performance improvement over compiler generated code reaches ~60%,
@ -315,6 +316,7 @@ if ($sse2) {
 	&bswap	("edx");
 	&mov	(&DWP(8*9+4,"esp"),"ecx");
 	&mov	(&DWP(8*9+0,"esp"),"edx");
+	&jmp	(&label("00_14_sse2"));

 &set_label("00_14_sse2",16);
 	&mov	("eax",&DWP(0,"edi"));