ghash-x86_64.pl: optimize for upcoming Atom.

This commit is contained in:
Andy Polyakov 2014-06-11 11:34:18 +02:00
parent 619b94667c
commit 1cf8f57b43

View File

@@ -59,11 +59,11 @@
# longer. A CPU with higher pclmulqdq issue rate would also benefit # longer. A CPU with higher pclmulqdq issue rate would also benefit
# from higher aggregate factor... # from higher aggregate factor...
# #
# Westmere 1.76(+14%) # Westmere 1.78(+13%)
# Sandy Bridge 1.79(+9%) # Sandy Bridge 1.80(+8%)
# Ivy Bridge 1.79(+8%) # Ivy Bridge 1.80(+7%)
# Haswell 0.55(+93%) (if system doesn't support AVX) # Haswell 0.55(+93%) (if system doesn't support AVX)
# Bulldozer 1.52(+25%) # Bulldozer 1.49(+27%)
# March 2013 # March 2013
# #
@@ -673,8 +673,8 @@ $code.=<<___;
pxor $Xl,$Xm pxor $Xl,$Xm
pclmulqdq \$0x00,$Hkey2,$Xl pclmulqdq \$0x00,$Hkey2,$Xl
pclmulqdq \$0x11,$Hkey2,$Xh pclmulqdq \$0x11,$Hkey2,$Xh
xorps $Xl,$Xln
pclmulqdq \$0x10,$HK,$Xm pclmulqdq \$0x10,$HK,$Xm
xorps $Xl,$Xln
xorps $Xh,$Xhn xorps $Xh,$Xhn
movups 0x50($Htbl),$HK movups 0x50($Htbl),$HK
xorps $Xm,$Xmn xorps $Xm,$Xmn
@@ -692,8 +692,8 @@ $code.=<<___;
pshufd \$0b01001110,$Xi,$T1 pshufd \$0b01001110,$Xi,$T1
pxor $Xi,$T1 pxor $Xi,$T1
pclmulqdq \$0x11,$Hkey3,$Xh pclmulqdq \$0x11,$Hkey3,$Xh
xorps $Xl,$Xln
pclmulqdq \$0x00,$HK,$Xm pclmulqdq \$0x00,$HK,$Xm
xorps $Xl,$Xln
xorps $Xh,$Xhn xorps $Xh,$Xhn
lea 0x40($inp),$inp lea 0x40($inp),$inp
@@ -711,23 +711,23 @@ $code.=<<___;
xorps $Xln,$Xi xorps $Xln,$Xi
movdqu 0x20($inp),$Xln movdqu 0x20($inp),$Xln
movdqa $Xl,$Xh movdqa $Xl,$Xh
pshufd \$0b01001110,$Xl,$Xm
pclmulqdq \$0x10,$HK,$T1 pclmulqdq \$0x10,$HK,$T1
pshufd \$0b01001110,$Xl,$Xm
xorps $Xhn,$Xhi xorps $Xhn,$Xhi
pxor $Xl,$Xm pxor $Xl,$Xm
pshufb $T3,$Xln pshufb $T3,$Xln
movups 0x20($Htbl),$HK movups 0x20($Htbl),$HK
pclmulqdq \$0x00,$Hkey,$Xl
xorps $Xmn,$T1 xorps $Xmn,$T1
movdqa $Xln,$Xhn pclmulqdq \$0x00,$Hkey,$Xl
pshufd \$0b01001110,$Xln,$Xmn pshufd \$0b01001110,$Xln,$Xmn
pxor $Xi,$T1 # aggregated Karatsuba post-processing pxor $Xi,$T1 # aggregated Karatsuba post-processing
pxor $Xln,$Xmn movdqa $Xln,$Xhn
pxor $Xhi,$T1 # pxor $Xhi,$T1 #
pxor $Xln,$Xmn
movdqa $T1,$T2 # movdqa $T1,$T2 #
pslldq \$8,$T1
pclmulqdq \$0x11,$Hkey,$Xh pclmulqdq \$0x11,$Hkey,$Xh
pslldq \$8,$T1
psrldq \$8,$T2 # psrldq \$8,$T2 #
pxor $T1,$Xi pxor $T1,$Xi
movdqa .L7_mask(%rip),$T1 movdqa .L7_mask(%rip),$T1
@@ -736,8 +736,8 @@ $code.=<<___;
pand $Xi,$T1 # 1st phase pand $Xi,$T1 # 1st phase
pshufb $T1,$T2 # pshufb $T1,$T2 #
pclmulqdq \$0x00,$HK,$Xm
pxor $Xi,$T2 # pxor $Xi,$T2 #
pclmulqdq \$0x00,$HK,$Xm
psllq \$57,$T2 # psllq \$57,$T2 #
movdqa $T2,$T1 # movdqa $T2,$T1 #
pslldq \$8,$T2 pslldq \$8,$T2
@@ -764,32 +764,31 @@ $code.=<<___;
movdqa $Xl,$Xh movdqa $Xl,$Xh
pxor $Xm,$Xmn pxor $Xm,$Xmn
pshufd \$0b01001110,$Xl,$Xm pshufd \$0b01001110,$Xl,$Xm
pxor $Xl,$Xm
pclmulqdq \$0x00,$Hkey3,$Xl
pxor $T2,$Xi # pxor $T2,$Xi #
pxor $T1,$Xhi pxor $T1,$Xhi
pxor $Xl,$Xm
pclmulqdq \$0x00,$Hkey3,$Xl
psrlq \$1,$Xi # psrlq \$1,$Xi #
pxor $Xhi,$Xi #
movdqa $Xi,$Xhi
pclmulqdq \$0x11,$Hkey3,$Xh pclmulqdq \$0x11,$Hkey3,$Xh
xorps $Xl,$Xln xorps $Xl,$Xln
pxor $Xhi,$Xi # pshufd \$0b01001110,$Xi,$T1
pxor $Xi,$T1
pclmulqdq \$0x00,$HK,$Xm pclmulqdq \$0x00,$HK,$Xm
xorps $Xh,$Xhn xorps $Xh,$Xhn
movdqa $Xi,$Xhi
pshufd \$0b01001110,$Xi,$T1
pxor $Xi,$T1
lea 0x40($inp),$inp lea 0x40($inp),$inp
sub \$0x40,$len sub \$0x40,$len
jnc .Lmod4_loop jnc .Lmod4_loop
.Ltail4x: .Ltail4x:
pclmulqdq \$0x00,$Hkey4,$Xi pclmulqdq \$0x00,$Hkey4,$Xi
xorps $Xm,$Xmn
pclmulqdq \$0x11,$Hkey4,$Xhi pclmulqdq \$0x11,$Hkey4,$Xhi
xorps $Xln,$Xi
pclmulqdq \$0x10,$HK,$T1 pclmulqdq \$0x10,$HK,$T1
xorps $Xm,$Xmn
xorps $Xln,$Xi
xorps $Xhn,$Xhi xorps $Xhn,$Xhi
pxor $Xi,$Xhi # aggregated Karatsuba post-processing pxor $Xi,$Xhi # aggregated Karatsuba post-processing
pxor $Xmn,$T1 pxor $Xmn,$T1
@@ -852,13 +851,13 @@ $code.=<<___;
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi pxor $Xhn,$Xhi
movdqu ($inp),$Xhn # Ii movdqu ($inp),$T2 # Ii
pxor $Xi,$T1 # aggregated Karatsuba post-processing pxor $Xi,$T1 # aggregated Karatsuba post-processing
pshufb $T3,$Xhn pshufb $T3,$T2
movdqu 16($inp),$Xln # Ii+1 movdqu 16($inp),$Xln # Ii+1
pxor $Xhi,$T1 pxor $Xhi,$T1
pxor $Xhn,$Xhi # "Ii+Xi", consume early pxor $T2,$Xhi # "Ii+Xi", consume early
pxor $T1,$Xmn pxor $T1,$Xmn
pshufb $T3,$Xln pshufb $T3,$Xln
movdqa $Xmn,$T1 # movdqa $Xmn,$T1 #
@@ -885,9 +884,9 @@ $code.=<<___;
pxor $T1,$Xhi # pxor $T1,$Xhi #
pxor $Xhn,$Xmn # pxor $Xhn,$Xmn #
pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase movdqa $Xi,$T2 # 2nd phase
psrlq \$1,$Xi psrlq \$1,$Xi
pclmulqdq \$0x11,$Hkey,$Xhn #######
pxor $T2,$Xhi # pxor $T2,$Xhi #
pxor $Xi,$T2 pxor $Xi,$T2
psrlq \$5,$Xi psrlq \$5,$Xi
@@ -896,7 +895,6 @@ $code.=<<___;
psrlq \$1,$Xi # psrlq \$1,$Xi #
pclmulqdq \$0x00,$HK,$Xmn ####### pclmulqdq \$0x00,$HK,$Xmn #######
pxor $Xhi,$Xi # pxor $Xhi,$Xi #
.byte 0x66,0x90
sub \$0x20,$len sub \$0x20,$len
ja .Lmod_loop ja .Lmod_loop