x86[_64] assembly pack: update benchmark results.
This commit is contained in:
parent
4b9e0b5f74
commit
d2e1803197
@ -23,14 +23,20 @@
|
|||||||
# AES-128-CBC +SHA1 stitch gain
|
# AES-128-CBC +SHA1 stitch gain
|
||||||
# Westmere 3.77[+5.6] 9.37 6.65 +41%
|
# Westmere 3.77[+5.6] 9.37 6.65 +41%
|
||||||
# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
|
# Sandy Bridge 5.05[+5.2(6.3)] 10.25(11.35) 6.16(7.08) +67%(+60%)
|
||||||
|
# Ivy Bridge 5.05[+4.7] 9.75 5.59 +74%
|
||||||
|
# Bulldozer 5.77[+6.1] 11.87 6.47 +83%
|
||||||
#
|
#
|
||||||
# AES-192-CBC
|
# AES-192-CBC
|
||||||
# Westmere 4.51 10.11 6.97 +45%
|
# Westmere 4.51 10.11 6.97 +45%
|
||||||
# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
|
# Sandy Bridge 6.05 11.25(12.35) 6.34(7.27) +77%(+70%)
|
||||||
|
# Ivy Bridge 6.05 10.75 6.07 +77%
|
||||||
|
# Bulldozer 6.89 12.99 7.02 +85%
|
||||||
#
|
#
|
||||||
# AES-256-CBC
|
# AES-256-CBC
|
||||||
# Westmere 5.25 10.85 7.25 +50%
|
# Westmere 5.25 10.85 7.25 +50%
|
||||||
# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
|
# Sandy Bridge 7.05 12.25(13.35) 7.06(7.70) +74%(+73%)
|
||||||
|
# Ivy Bridge 7.05 11.75 7.12 +65%
|
||||||
|
# Bulldozer 8.00 14.10 8.24 +71%
|
||||||
#
|
#
|
||||||
# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
|
# (*) There are two code paths: SSSE3 and AVX. See sha1-586.pl for
|
||||||
# background information. Above numbers in parentheses are SSSE3
|
# background information. Above numbers in parentheses are SSSE3
|
||||||
@ -47,6 +53,8 @@
|
|||||||
# AES-128-CBC AES-192-CBC AES-256-CBC
|
# AES-128-CBC AES-192-CBC AES-256-CBC
|
||||||
# Westmere 1.31 1.55 1.80
|
# Westmere 1.31 1.55 1.80
|
||||||
# Sandy Bridge 0.93 1.06 1.22
|
# Sandy Bridge 0.93 1.06 1.22
|
||||||
|
# Ivy Bridge 0.92 1.06 1.21
|
||||||
|
# Bulldozer 0.76 0.90 1.04
|
||||||
|
|
||||||
$flavour = shift;
|
$flavour = shift;
|
||||||
$output = shift;
|
$output = shift;
|
||||||
|
@ -157,6 +157,13 @@
|
|||||||
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
|
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
|
||||||
# in CTR mode AES instruction interleave factor was chosen to be 6x.
|
# in CTR mode AES instruction interleave factor was chosen to be 6x.
|
||||||
|
|
||||||
|
######################################################################
|
||||||
|
# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
|
||||||
|
# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
|
||||||
|
# in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec]
|
||||||
|
# instruction latency is 9 cycles and that they can be issued every
|
||||||
|
# cycle.
|
||||||
|
|
||||||
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
|
||||||
# generates drop-in replacement for
|
# generates drop-in replacement for
|
||||||
# crypto/aes/asm/aes-x86_64.pl:-)
|
# crypto/aes/asm/aes-x86_64.pl:-)
|
||||||
|
@ -26,6 +26,8 @@
|
|||||||
# P4 125/125 17.8 84(***)
|
# P4 125/125 17.8 84(***)
|
||||||
# Opteron 66 /70 10.1 30
|
# Opteron 66 /70 10.1 30
|
||||||
# Core2 54 /67 8.4 18
|
# Core2 54 /67 8.4 18
|
||||||
|
# Atom 105/105 16.8 53
|
||||||
|
# VIA Nano 69 /71 13.0 27
|
||||||
#
|
#
|
||||||
# (*) gcc 3.4.x was observed to generate few percent slower code,
|
# (*) gcc 3.4.x was observed to generate few percent slower code,
|
||||||
# which is one of the reasons why 2.95.3 results were chosen,
|
# which is one of the reasons why 2.95.3 results were chosen,
|
||||||
@ -113,6 +115,10 @@
|
|||||||
# similar manner resulted in almost 20% degradation on Sandy Bridge,
|
# similar manner resulted in almost 20% degradation on Sandy Bridge,
|
||||||
# where original 64-bit code processes one byte in 1.95 cycles.
|
# where original 64-bit code processes one byte in 1.95 cycles.
|
||||||
|
|
||||||
|
#####################################################################
|
||||||
|
# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
|
||||||
|
# 32-bit mode and 1.89 in 64-bit.
|
||||||
|
|
||||||
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
||||||
push(@INC,"${dir}","${dir}../../perlasm");
|
push(@INC,"${dir}","${dir}../../perlasm");
|
||||||
require "x86asm.pl";
|
require "x86asm.pl";
|
||||||
|
@ -22,6 +22,8 @@
|
|||||||
# P4 28.6 14.0 +100%
|
# P4 28.6 14.0 +100%
|
||||||
# Opteron 19.3 7.7 +150%
|
# Opteron 19.3 7.7 +150%
|
||||||
# Core2 17.8 8.1(**) +120%
|
# Core2 17.8 8.1(**) +120%
|
||||||
|
# Atom 31.6 16.8 +88%
|
||||||
|
# VIA Nano 21.8 10.1 +115%
|
||||||
#
|
#
|
||||||
# (*) comparison is not completely fair, because C results are
|
# (*) comparison is not completely fair, because C results are
|
||||||
# for vanilla "256B" implementation, while assembler results
|
# for vanilla "256B" implementation, while assembler results
|
||||||
|
@ -43,6 +43,9 @@
|
|||||||
# Westmere 5.1/+94%(**)
|
# Westmere 5.1/+94%(**)
|
||||||
# Sandy Bridge 5.0/+8%
|
# Sandy Bridge 5.0/+8%
|
||||||
# Atom 12.6/+6%
|
# Atom 12.6/+6%
|
||||||
|
# VIA Nano 6.4/+9%
|
||||||
|
# Ivy Bridge 4.9/±0%
|
||||||
|
# Bulldozer 4.9/+15%
|
||||||
#
|
#
|
||||||
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
|
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
|
||||||
# but this specific code performs poorly on Core2. And vice
|
# but this specific code performs poorly on Core2. And vice
|
||||||
|
@ -30,6 +30,9 @@
|
|||||||
# Westmere 4.3 5.2 9.5 7.0 +36%
|
# Westmere 4.3 5.2 9.5 7.0 +36%
|
||||||
# Sandy Bridge 4.2 5.5 9.7 6.8 +43%
|
# Sandy Bridge 4.2 5.5 9.7 6.8 +43%
|
||||||
# Atom 9.3 6.5 15.8 11.1 +42%
|
# Atom 9.3 6.5 15.8 11.1 +42%
|
||||||
|
# VIA Nano 6.3 5.4 11.7 8.6 +37%
|
||||||
|
# Ivy Bridge 4.1 5.2 9.3 6.0 +54%
|
||||||
|
# Bulldozer 4.5 5.4 9.9 7.7 +29%
|
||||||
#
|
#
|
||||||
# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
|
# (*) rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
|
||||||
# is +53%...
|
# is +53%...
|
||||||
|
@ -92,6 +92,9 @@
|
|||||||
# Westmere 4.2/+60%
|
# Westmere 4.2/+60%
|
||||||
# Sandy Bridge 4.2/+120%
|
# Sandy Bridge 4.2/+120%
|
||||||
# Atom 9.3/+80%
|
# Atom 9.3/+80%
|
||||||
|
# VIA Nano 6.4/+4%
|
||||||
|
# Ivy Bridge 4.1/+30%
|
||||||
|
# Bulldozer 4.5/+30%(*)
|
||||||
#
|
#
|
||||||
# (*) But corresponding loop has fewer instructions, which should have
|
# (*) But corresponding loop has fewer instructions, which should have
|
||||||
# positive effect on upcoming Bulldozer, which has one less ALU.
|
# positive effect on upcoming Bulldozer, which has one less ALU.
|
||||||
|
@ -92,6 +92,9 @@
|
|||||||
# Atom 12.5 9.5(*)/+32% -
|
# Atom 12.5 9.5(*)/+32% -
|
||||||
# Westmere 7.3 5.6/+30% -
|
# Westmere 7.3 5.6/+30% -
|
||||||
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
|
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
|
||||||
|
# Ivy Bridge 7.2 4.9/+47% 4.8(**)/+50%
|
||||||
|
# Bulldozer 11.6 6.2/+88%
|
||||||
|
# VIA Nano 10.6 7.5/+41%
|
||||||
#
|
#
|
||||||
# (*) Loop is 1056 instructions long and expected result is ~8.25.
|
# (*) Loop is 1056 instructions long and expected result is ~8.25.
|
||||||
# It remains a mystery [to me] why ILP is limited to 1.7.
|
# It remains a mystery [to me] why ILP is limited to 1.7.
|
||||||
|
@ -60,6 +60,9 @@
|
|||||||
# Atom 11.0 9.7/+13% -
|
# Atom 11.0 9.7/+13% -
|
||||||
# Westmere 7.1 5.6/+27% -
|
# Westmere 7.1 5.6/+27% -
|
||||||
# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
|
# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
|
||||||
|
# Ivy Bridge 6.4 4.8/+33% 4.7/+36%
|
||||||
|
# Bulldozer 10.9 6.1/+79%
|
||||||
|
# VIA Nano 10.2 7.4/+38%
|
||||||
|
|
||||||
$flavour = shift;
|
$flavour = shift;
|
||||||
$output = shift;
|
$output = shift;
|
||||||
|
@ -11,15 +11,16 @@
|
|||||||
#
|
#
|
||||||
# Performance in clock cycles per processed byte (less is better):
|
# Performance in clock cycles per processed byte (less is better):
|
||||||
#
|
#
|
||||||
# Pentium PIII P4 AMD K8 Core2
|
# PIII P4 AMD K8 Core2 SB Atom Bldzr
|
||||||
# gcc 100 75 116 54 66
|
# gcc 75 116 54 66 58 126 121
|
||||||
# icc 97 77 95 55 57
|
# icc 77 95 55 57 - - -
|
||||||
# x86 asm 61 56 82 36 40
|
# x86 asm 56 82 36 40 35 68 50
|
||||||
# SSE2 asm - - 38 24 20
|
# SSE2 asm - 38 24 20 16 64(**) 18
|
||||||
# x86_64 asm(*) - - 30 10.0 10.5
|
# x86_64 asm(*) - 33 9.6 10.3 11.3 14.7 13.5
|
||||||
#
|
#
|
||||||
# (*) x86_64 assembler performance is presented for reference
|
# (*) x86_64 assembler performance is presented for reference
|
||||||
# purposes.
|
# purposes.
|
||||||
|
# (**) paddq is incredibly slow on Atom.
|
||||||
#
|
#
|
||||||
# IALU code-path is optimized for older Pentiums. On vanilla Pentium
|
# IALU code-path is optimized for older Pentiums. On vanilla Pentium
|
||||||
# performance improvement over compiler generated code reaches ~60%,
|
# performance improvement over compiler generated code reaches ~60%,
|
||||||
@ -315,6 +316,7 @@ if ($sse2) {
|
|||||||
&bswap ("edx");
|
&bswap ("edx");
|
||||||
&mov (&DWP(8*9+4,"esp"),"ecx");
|
&mov (&DWP(8*9+4,"esp"),"ecx");
|
||||||
&mov (&DWP(8*9+0,"esp"),"edx");
|
&mov (&DWP(8*9+0,"esp"),"edx");
|
||||||
|
&jmp (&label("00_14_sse2"));
|
||||||
|
|
||||||
&set_label("00_14_sse2",16);
|
&set_label("00_14_sse2",16);
|
||||||
&mov ("eax",&DWP(0,"edi"));
|
&mov ("eax",&DWP(0,"edi"));
|
||||||
|
Loading…
x
Reference in New Issue
Block a user