x86[_64] assembly pack: update benchmark results.
This commit is contained in:
parent
4b9e0b5f74
commit
d2e1803197
@ -23,14 +23,20 @@
|
||||
# AES-128-CBC +SHA1 stitch gain
|
||||
# Westmere 3.77[+5.6] 9.37 6.65 +41%
|
||||
# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
|
||||
# Ivy Bridge 5.05[+4.7] 9.75 5.59 +74%
|
||||
# Bulldozer 5.77[+6.1] 11.87 6.47 +83%
|
||||
#
|
||||
# AES-192-CBC
|
||||
# Westmere 4.51 10.11 6.97 +45%
|
||||
# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
|
||||
# Ivy Bridge 6.05 10.75 6.07 +77%
|
||||
# Bulldozer 6.89 12.99 7.02 +85%
|
||||
#
|
||||
# AES-256-CBC
|
||||
# Westmere 5.25 10.85 7.25 +50%
|
||||
# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
|
||||
# Ivy Bridge 7.05 11.75 7.12 +65%
|
||||
# Bulldozer 8.00 14.10 8.24 +71%
|
||||
#
|
||||
# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
|
||||
# background information. Above numbers in parentheses are SSSE3
|
||||
@ -47,6 +53,8 @@
|
||||
# AES-128-CBC AES-192-CBC AES-256-CBC
|
||||
# Westmere 1.31 1.55 1.80
|
||||
# Sandy Bridge 0.93 1.06 1.22
|
||||
# Ivy Bridge 0.92 1.06 1.21
|
||||
# Bulldozer 0.76 0.90 1.04
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
@ -157,6 +157,13 @@
|
||||
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
|
||||
# in CTR mode AES instruction interleave factor was chosen to be 6x.
|
||||
|
||||
######################################################################
|
||||
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
|
||||
# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
|
||||
# in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec]
|
||||
# instruction latency is 9 cycles and that they can be issued every
|
||||
# cycle.
|
||||
|
||||
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
||||
# generates drop-in replacement for
|
||||
# crypto/aes/asm/aes-x86_64.pl:-)
|
||||
|
@ -26,6 +26,8 @@
|
||||
# P4 125/125 17.8 84(***)
|
||||
# Opteron 66 /70 10.1 30
|
||||
# Core2 54 /67 8.4 18
|
||||
# Atom 105/105 16.8 53
|
||||
# VIA Nano 69 /71 13.0 27
|
||||
#
|
||||
# (*) gcc 3.4.x was observed to generate a few percent slower code,
|
||||
# which is one of the reasons why 2.95.3 results were chosen,
|
||||
@ -113,6 +115,10 @@
|
||||
# similar manner resulted in almost 20% degradation on Sandy Bridge,
|
||||
# where original 64-bit code processes one byte in 1.95 cycles.
|
||||
|
||||
#####################################################################
|
||||
# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
|
||||
# 32-bit mode and 1.89 in 64-bit.
|
||||
|
||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||
push(@INC,"${dir}","${dir}../../perlasm");
|
||||
require "x86asm.pl";
|
||||
|
@ -22,6 +22,8 @@
|
||||
# P4 28.6 14.0 +100%
|
||||
# Opteron 19.3 7.7 +150%
|
||||
# Core2 17.8 8.1(**) +120%
|
||||
# Atom 31.6 16.8 +88%
|
||||
# VIA Nano 21.8 10.1 +115%
|
||||
#
|
||||
# (*) comparison is not completely fair, because C results are
|
||||
# for vanilla "256B" implementation, while assembler results
|
||||
|
@ -43,6 +43,9 @@
|
||||
# Westmere 5.1/+94%(**)
|
||||
# Sandy Bridge 5.0/+8%
|
||||
# Atom 12.6/+6%
|
||||
# VIA Nano 6.4/+9%
|
||||
# Ivy Bridge 4.9/±0%
|
||||
# Bulldozer 4.9/+15%
|
||||
#
|
||||
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
|
||||
# but this specific code performs poorly on Core2. And vice
|
||||
|
@ -30,6 +30,9 @@
|
||||
# Westmere 4.3 5.2 9.5 7.0 +36%
|
||||
# Sandy Bridge 4.2 5.5 9.7 6.8 +43%
|
||||
# Atom 9.3 6.5 15.8 11.1 +42%
|
||||
# VIA Nano 6.3 5.4 11.7 8.6 +37%
|
||||
# Ivy Bridge 4.1 5.2 9.3 6.0 +54%
|
||||
# Bulldozer 4.5 5.4 9.9 7.7 +29%
|
||||
#
|
||||
# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
|
||||
# is +53%...
|
||||
|
@ -92,6 +92,9 @@
|
||||
# Westmere 4.2/+60%
|
||||
# Sandy Bridge 4.2/+120%
|
||||
# Atom 9.3/+80%
|
||||
# VIA Nano 6.4/+4%
|
||||
# Ivy Bridge 4.1/+30%
|
||||
# Bulldozer 4.5/+30%(*)
|
||||
#
|
||||
# (*) But the corresponding loop has fewer instructions, which should have
|
||||
# positive effect on upcoming Bulldozer, which has one less ALU.
|
||||
|
@ -92,6 +92,9 @@
|
||||
# Atom 12.5 9.5(*)/+32% -
|
||||
# Westmere 7.3 5.6/+30% -
|
||||
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
|
||||
# Ivy Bridge 7.2 4.9/+47% 4.8(**)/+50%
|
||||
# Bulldozer 11.6 6.2/+88%
|
||||
# VIA Nano 10.6 7.5/+41%
|
||||
#
|
||||
# (*) Loop is 1056 instructions long and expected result is ~8.25.
|
||||
# It remains a mystery [to me] why ILP is limited to 1.7.
|
||||
|
@ -60,6 +60,9 @@
|
||||
# Atom 11.0 9.7/+13% -
|
||||
# Westmere 7.1 5.6/+27% -
|
||||
# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
|
||||
# Ivy Bridge 6.4 4.8/+33% 4.7/+36%
|
||||
# Bulldozer 10.9 6.1/+79%
|
||||
# VIA Nano 10.2 7.4/+38%
|
||||
|
||||
$flavour = shift;
|
||||
$output = shift;
|
||||
|
@ -11,15 +11,16 @@
|
||||
#
|
||||
# Performance in clock cycles per processed byte (less is better):
|
||||
#
|
||||
# Pentium PIII P4 AMD K8 Core2
|
||||
# gcc 100 75 116 54 66
|
||||
# icc 97 77 95 55 57
|
||||
# x86 asm 61 56 82 36 40
|
||||
# SSE2 asm - - 38 24 20
|
||||
# x86_64 asm(*) - - 30 10.0 10.5
|
||||
# PIII P4 AMD K8 Core2 SB Atom Bldzr
|
||||
# gcc 75 116 54 66 58 126 121
|
||||
# icc 77 95 55 57 - - -
|
||||
# x86 asm 56 82 36 40 35 68 50
|
||||
# SSE2 asm - 38 24 20 16 64(**) 18
|
||||
# x86_64 asm(*) - 33 9.6 10.3 11.3 14.7 13.5
|
||||
#
|
||||
# (*) x86_64 assembler performance is presented for reference
|
||||
# purposes.
|
||||
# (*) x86_64 assembler performance is presented for reference
|
||||
# purposes.
|
||||
# (**) paddq is incredibly slow on Atom.
|
||||
#
|
||||
# IALU code-path is optimized for older Pentiums. On vanilla Pentium
|
||||
# performance improvement over compiler generated code reaches ~60%,
|
||||
@ -315,6 +316,7 @@ if ($sse2) {
|
||||
&bswap ("edx");
|
||||
&mov (&DWP(8*9+4,"esp"),"ecx");
|
||||
&mov (&DWP(8*9+0,"esp"),"edx");
|
||||
&jmp (&label("00_14_sse2"));
|
||||
|
||||
&set_label("00_14_sse2",16);
|
||||
&mov ("eax",&DWP(0,"edi"));
|
||||
|
Loading…
x
Reference in New Issue
Block a user