sha1-[586|x86_64].pl: shave off one instruction from body_40_59, it's

2% fewer instructions in SIMD code paths, so 2% improvement on average :-)
This commit is contained in:
Andy Polyakov 2012-07-15 20:33:30 +00:00
parent 7bb98eee3c
commit 69f45c520c
2 changed files with 30 additions and 28 deletions

View File

@ -89,12 +89,12 @@
# P4 10.6 -
# AMD K8 7.1 -
# Core2 7.3 6.1/+20% -
# Atom 12.5 9.5(*)/+32% -
# Westmere 7.3 5.6/+30% -
# Sandy Bridge 8.8 6.2/+40% 5.1(**)/+70%
# Ivy Bridge 7.2 4.9/+47% 4.8(**)/+50%
# Bulldozer 11.6 6.2/+88%
# VIA Nano 10.6 7.5/+41%
# Atom 12.5 9.3(*)/+35% -
# Westmere 7.3 5.5/+33% -
# Sandy Bridge 8.8 6.2/+40% 5.2(**)/+70%
# Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53%
# Bulldozer 11.6 6.0/+92%
# VIA Nano 10.6 7.6/+40%
#
# (*) Loop is 1056 instructions long and expected result is ~8.25.
# It remains a mystery [to me] why ILP is limited to 1.7.
@ -616,7 +616,7 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&movdqa (@X[2],@X[-1&7]) if ($Xi==8);
@ -783,17 +783,16 @@ sub body_20_39 () {
# body_40_59: returns a list of strings, each holding Perl source for a call
# to one emitter routine (&mov, &xor, &and, &add, &$_ror, &$_rol).
#
# The first string is prefixed with code that loads ($a,$b,$c,$d,$e) from @V,
# and the last string is suffixed with code that moves the final element of
# @V and of @T to the front, so the working variables and the temporaries are
# rotated by one position once the whole list has been processed.
# Presumably the caller eval's the strings in order -- the consumer is not
# fully visible here, confirm against Xupdate_*_32_79.
#
# NOTE(review): this listing appears to come from a rendered diff whose +/-
# markers were lost, so removed and added instruction strings may be
# interleaved below (the "restore $c" pair and the extra &and/&add on @T[1]
# look like the pre-change sequence).  Verify against the real file before
# relying on the exact instruction order.
sub body_40_59 () {
(
'($a,$b,$c,$d,$e)=@V;'.
'&mov (@T[1],$c);',
'&xor ($c,$d);',
'&xor (@T[0],$c);',
'&xor (@T[1],$d);',
'&add ($e,&DWP(4*($j++&15),"esp"));', # X[]+K xfer
'&and (@T[1],$d);',
'&and (@T[0],$c);', # ($b&($c^$d))
'&and (@T[0],@T[1]);',
'&$_ror ($b,7);', # $b>>>2
'&add ($e,@T[1]);',
'&xor (@T[0],$c);',
'&mov (@T[1],$a);', # $b in next round
'&$_rol ($a,5);',
'&add ($e,@T[0]);',
'&xor ($c,$d);', # restore $c
'&mov (@T[0],$b);', # copy of $c in next round
# Final string: last instruction plus the @V/@T rotation described above.
'&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
@ -809,6 +808,7 @@ sub body_40_59 () {
&Xupdate_ssse3_32_79(\&body_20_39);
&Xupdate_ssse3_32_79(\&body_20_39);
&Xupdate_ssse3_32_79(\&body_20_39);
&mov (@T[1],@V[2]); # copy of $c in next round
&Xupdate_ssse3_32_79(\&body_40_59);
&Xupdate_ssse3_32_79(\&body_40_59);
&Xupdate_ssse3_32_79(\&body_40_59);
@ -1032,7 +1032,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
sub Xupdate_avx_32_79()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
@ -1173,6 +1173,7 @@ sub Xtail_avx()
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_20_39);
&mov (@T[1],@V[2]); # copy of $c in next round
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);

View File

@ -56,12 +56,12 @@
# x86_64 SSSE3 AVX
# P4 9.8 -
# Opteron 6.6 -
# Core2 6.7 6.1/+10% -
# Atom 11.0 9.7/+13% -
# Westmere 7.1 5.6/+27% -
# Sandy Bridge 7.9 6.3/+25% 5.2/+51%
# Ivy Bridge 6.4 4.8/+33% 4.7/+36%
# Bulldozer 10.9 6.1/+79%
# Core2 6.7 6.2/+8% -
# Atom 11.0 9.5/+15% -
# Westmere 7.1 5.5/+29% -
# Sandy Bridge 7.9 6.2/+28% 5.1/+54%
# Ivy Bridge 6.4 4.7/+35% 4.6/+37%
# Bulldozer 10.9 6.0/+82%
# VIA Nano 10.2 7.4/+38%
$flavour = shift;
@ -453,7 +453,7 @@ sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
sub Xupdate_ssse3_32_79()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&movdqa (@Tx[0],@X[-1&7]) if ($Xi==8);
@ -618,17 +618,16 @@ sub body_20_39 () {
# body_40_59: returns a list of strings, each holding Perl source for a call
# to one emitter routine (&mov, &xor, &and, &add, &$_ror, &$_rol).
#
# The first string is prefixed with code that loads ($a,$b,$c,$d,$e) from @V,
# and the last string is suffixed with code that moves the final element of
# @V and of @T to the front, so the working variables and the temporaries are
# rotated by one position once the whole list has been processed.
# Presumably the caller eval's the strings in order -- the consumer is not
# fully visible here, confirm against Xupdate_*_32_79.
#
# NOTE(review): this listing appears to come from a rendered diff whose +/-
# markers were lost, so removed and added instruction strings may be
# interleaved below (the "restore $c" pair and the extra &and/&add on @T[1]
# look like the pre-change sequence).  Verify against the real file before
# relying on the exact instruction order.
sub body_40_59 () {
(
'($a,$b,$c,$d,$e)=@V;'.
'&mov (@T[1],$c);',
'&xor ($c,$d);',
'&xor (@T[0],$c);',
'&xor (@T[1],$d);',
'&add ($e,eval(4*($j++&15))."(%rsp)");', # X[]+K xfer
'&and (@T[1],$d);',
'&and (@T[0],$c);', # ($b&($c^$d))
# NOTE(review): $T[1] is the scalar-element spelling, while the other
# operands in this list use the one-element slice spelling @T[N]; both
# refer to element 1 of @T.  Consider unifying the spelling.
'&and (@T[0],$T[1]);',
'&$_ror ($b,7);', # $b>>>2
'&add ($e,@T[1]);',
'&xor (@T[0],$c);',
'&mov (@T[1],$a);', # $b in next round
'&$_rol ($a,5);',
'&add ($e,@T[0]);',
'&xor ($c,$d);', # restore $c
'&mov (@T[0],$b);', # copy of $c in next round
# Final string: last instruction plus the @V/@T rotation described above.
'&add ($e,$a);' .'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
);
}
@ -646,6 +645,7 @@ ___
&Xupdate_ssse3_32_79(\&body_20_39);
&Xupdate_ssse3_32_79(\&body_20_39);
&Xupdate_ssse3_32_79(\&body_20_39);
&mov (@T[1],@V[2]); # copy of $c in next round
&Xupdate_ssse3_32_79(\&body_40_59);
&Xupdate_ssse3_32_79(\&body_40_59);
&Xupdate_ssse3_32_79(\&body_40_59);
@ -859,7 +859,7 @@ sub Xupdate_avx_16_31() # recall that $Xi starts with 4
sub Xupdate_avx_32_79()
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions
my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
my ($a,$b,$c,$d,$e);
&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
@ -1002,6 +1002,7 @@ ___
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_20_39);
&Xupdate_avx_32_79(\&body_20_39);
&mov (@T[1],@V[2]); # copy of $c in next round
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);
&Xupdate_avx_32_79(\&body_40_59);