ghash-x86_64.pl: optimize for upcoming Atom.
This commit is contained in:
parent
619b94667c
commit
1cf8f57b43
@ -59,11 +59,11 @@
|
|||||||
# longer. A CPU with higher pclmulqdq issue rate would also benefit
|
# longer. A CPU with higher pclmulqdq issue rate would also benefit
|
||||||
# from higher aggregate factor...
|
# from higher aggregate factor...
|
||||||
#
|
#
|
||||||
# Westmere 1.76(+14%)
|
# Westmere 1.78(+13%)
|
||||||
# Sandy Bridge 1.79(+9%)
|
# Sandy Bridge 1.80(+8%)
|
||||||
# Ivy Bridge 1.79(+8%)
|
# Ivy Bridge 1.80(+7%)
|
||||||
# Haswell 0.55(+93%) (if system doesn't support AVX)
|
# Haswell 0.55(+93%) (if system doesn't support AVX)
|
||||||
# Bulldozer 1.52(+25%)
|
# Bulldozer 1.49(+27%)
|
||||||
|
|
||||||
# March 2013
|
# March 2013
|
||||||
#
|
#
|
||||||
@ -673,8 +673,8 @@ $code.=<<___;
|
|||||||
pxor $Xl,$Xm
|
pxor $Xl,$Xm
|
||||||
pclmulqdq \$0x00,$Hkey2,$Xl
|
pclmulqdq \$0x00,$Hkey2,$Xl
|
||||||
pclmulqdq \$0x11,$Hkey2,$Xh
|
pclmulqdq \$0x11,$Hkey2,$Xh
|
||||||
xorps $Xl,$Xln
|
|
||||||
pclmulqdq \$0x10,$HK,$Xm
|
pclmulqdq \$0x10,$HK,$Xm
|
||||||
|
xorps $Xl,$Xln
|
||||||
xorps $Xh,$Xhn
|
xorps $Xh,$Xhn
|
||||||
movups 0x50($Htbl),$HK
|
movups 0x50($Htbl),$HK
|
||||||
xorps $Xm,$Xmn
|
xorps $Xm,$Xmn
|
||||||
@ -692,8 +692,8 @@ $code.=<<___;
|
|||||||
pshufd \$0b01001110,$Xi,$T1
|
pshufd \$0b01001110,$Xi,$T1
|
||||||
pxor $Xi,$T1
|
pxor $Xi,$T1
|
||||||
pclmulqdq \$0x11,$Hkey3,$Xh
|
pclmulqdq \$0x11,$Hkey3,$Xh
|
||||||
xorps $Xl,$Xln
|
|
||||||
pclmulqdq \$0x00,$HK,$Xm
|
pclmulqdq \$0x00,$HK,$Xm
|
||||||
|
xorps $Xl,$Xln
|
||||||
xorps $Xh,$Xhn
|
xorps $Xh,$Xhn
|
||||||
|
|
||||||
lea 0x40($inp),$inp
|
lea 0x40($inp),$inp
|
||||||
@ -711,23 +711,23 @@ $code.=<<___;
|
|||||||
xorps $Xln,$Xi
|
xorps $Xln,$Xi
|
||||||
movdqu 0x20($inp),$Xln
|
movdqu 0x20($inp),$Xln
|
||||||
movdqa $Xl,$Xh
|
movdqa $Xl,$Xh
|
||||||
pshufd \$0b01001110,$Xl,$Xm
|
|
||||||
pclmulqdq \$0x10,$HK,$T1
|
pclmulqdq \$0x10,$HK,$T1
|
||||||
|
pshufd \$0b01001110,$Xl,$Xm
|
||||||
xorps $Xhn,$Xhi
|
xorps $Xhn,$Xhi
|
||||||
pxor $Xl,$Xm
|
pxor $Xl,$Xm
|
||||||
pshufb $T3,$Xln
|
pshufb $T3,$Xln
|
||||||
movups 0x20($Htbl),$HK
|
movups 0x20($Htbl),$HK
|
||||||
pclmulqdq \$0x00,$Hkey,$Xl
|
|
||||||
xorps $Xmn,$T1
|
xorps $Xmn,$T1
|
||||||
movdqa $Xln,$Xhn
|
pclmulqdq \$0x00,$Hkey,$Xl
|
||||||
pshufd \$0b01001110,$Xln,$Xmn
|
pshufd \$0b01001110,$Xln,$Xmn
|
||||||
|
|
||||||
pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
||||||
pxor $Xln,$Xmn
|
movdqa $Xln,$Xhn
|
||||||
pxor $Xhi,$T1 #
|
pxor $Xhi,$T1 #
|
||||||
|
pxor $Xln,$Xmn
|
||||||
movdqa $T1,$T2 #
|
movdqa $T1,$T2 #
|
||||||
pslldq \$8,$T1
|
|
||||||
pclmulqdq \$0x11,$Hkey,$Xh
|
pclmulqdq \$0x11,$Hkey,$Xh
|
||||||
|
pslldq \$8,$T1
|
||||||
psrldq \$8,$T2 #
|
psrldq \$8,$T2 #
|
||||||
pxor $T1,$Xi
|
pxor $T1,$Xi
|
||||||
movdqa .L7_mask(%rip),$T1
|
movdqa .L7_mask(%rip),$T1
|
||||||
@ -736,8 +736,8 @@ $code.=<<___;
|
|||||||
|
|
||||||
pand $Xi,$T1 # 1st phase
|
pand $Xi,$T1 # 1st phase
|
||||||
pshufb $T1,$T2 #
|
pshufb $T1,$T2 #
|
||||||
pclmulqdq \$0x00,$HK,$Xm
|
|
||||||
pxor $Xi,$T2 #
|
pxor $Xi,$T2 #
|
||||||
|
pclmulqdq \$0x00,$HK,$Xm
|
||||||
psllq \$57,$T2 #
|
psllq \$57,$T2 #
|
||||||
movdqa $T2,$T1 #
|
movdqa $T2,$T1 #
|
||||||
pslldq \$8,$T2
|
pslldq \$8,$T2
|
||||||
@ -764,32 +764,31 @@ $code.=<<___;
|
|||||||
movdqa $Xl,$Xh
|
movdqa $Xl,$Xh
|
||||||
pxor $Xm,$Xmn
|
pxor $Xm,$Xmn
|
||||||
pshufd \$0b01001110,$Xl,$Xm
|
pshufd \$0b01001110,$Xl,$Xm
|
||||||
pxor $Xl,$Xm
|
|
||||||
pclmulqdq \$0x00,$Hkey3,$Xl
|
|
||||||
pxor $T2,$Xi #
|
pxor $T2,$Xi #
|
||||||
pxor $T1,$Xhi
|
pxor $T1,$Xhi
|
||||||
|
pxor $Xl,$Xm
|
||||||
|
pclmulqdq \$0x00,$Hkey3,$Xl
|
||||||
psrlq \$1,$Xi #
|
psrlq \$1,$Xi #
|
||||||
|
pxor $Xhi,$Xi #
|
||||||
|
movdqa $Xi,$Xhi
|
||||||
pclmulqdq \$0x11,$Hkey3,$Xh
|
pclmulqdq \$0x11,$Hkey3,$Xh
|
||||||
xorps $Xl,$Xln
|
xorps $Xl,$Xln
|
||||||
pxor $Xhi,$Xi #
|
pshufd \$0b01001110,$Xi,$T1
|
||||||
|
pxor $Xi,$T1
|
||||||
|
|
||||||
pclmulqdq \$0x00,$HK,$Xm
|
pclmulqdq \$0x00,$HK,$Xm
|
||||||
xorps $Xh,$Xhn
|
xorps $Xh,$Xhn
|
||||||
|
|
||||||
movdqa $Xi,$Xhi
|
|
||||||
pshufd \$0b01001110,$Xi,$T1
|
|
||||||
pxor $Xi,$T1
|
|
||||||
|
|
||||||
lea 0x40($inp),$inp
|
lea 0x40($inp),$inp
|
||||||
sub \$0x40,$len
|
sub \$0x40,$len
|
||||||
jnc .Lmod4_loop
|
jnc .Lmod4_loop
|
||||||
|
|
||||||
.Ltail4x:
|
.Ltail4x:
|
||||||
pclmulqdq \$0x00,$Hkey4,$Xi
|
pclmulqdq \$0x00,$Hkey4,$Xi
|
||||||
xorps $Xm,$Xmn
|
|
||||||
pclmulqdq \$0x11,$Hkey4,$Xhi
|
pclmulqdq \$0x11,$Hkey4,$Xhi
|
||||||
xorps $Xln,$Xi
|
|
||||||
pclmulqdq \$0x10,$HK,$T1
|
pclmulqdq \$0x10,$HK,$T1
|
||||||
|
xorps $Xm,$Xmn
|
||||||
|
xorps $Xln,$Xi
|
||||||
xorps $Xhn,$Xhi
|
xorps $Xhn,$Xhi
|
||||||
pxor $Xi,$Xhi # aggregated Karatsuba post-processing
|
pxor $Xi,$Xhi # aggregated Karatsuba post-processing
|
||||||
pxor $Xmn,$T1
|
pxor $Xmn,$T1
|
||||||
@ -852,13 +851,13 @@ $code.=<<___;
|
|||||||
|
|
||||||
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
||||||
pxor $Xhn,$Xhi
|
pxor $Xhn,$Xhi
|
||||||
movdqu ($inp),$Xhn # Ii
|
movdqu ($inp),$T2 # Ii
|
||||||
pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
||||||
pshufb $T3,$Xhn
|
pshufb $T3,$T2
|
||||||
movdqu 16($inp),$Xln # Ii+1
|
movdqu 16($inp),$Xln # Ii+1
|
||||||
|
|
||||||
pxor $Xhi,$T1
|
pxor $Xhi,$T1
|
||||||
pxor $Xhn,$Xhi # "Ii+Xi", consume early
|
pxor $T2,$Xhi # "Ii+Xi", consume early
|
||||||
pxor $T1,$Xmn
|
pxor $T1,$Xmn
|
||||||
pshufb $T3,$Xln
|
pshufb $T3,$Xln
|
||||||
movdqa $Xmn,$T1 #
|
movdqa $Xmn,$T1 #
|
||||||
@ -885,9 +884,9 @@ $code.=<<___;
|
|||||||
pxor $T1,$Xhi #
|
pxor $T1,$Xhi #
|
||||||
pxor $Xhn,$Xmn #
|
pxor $Xhn,$Xmn #
|
||||||
|
|
||||||
pclmulqdq \$0x11,$Hkey,$Xhn #######
|
|
||||||
movdqa $Xi,$T2 # 2nd phase
|
movdqa $Xi,$T2 # 2nd phase
|
||||||
psrlq \$1,$Xi
|
psrlq \$1,$Xi
|
||||||
|
pclmulqdq \$0x11,$Hkey,$Xhn #######
|
||||||
pxor $T2,$Xhi #
|
pxor $T2,$Xhi #
|
||||||
pxor $Xi,$T2
|
pxor $Xi,$T2
|
||||||
psrlq \$5,$Xi
|
psrlq \$5,$Xi
|
||||||
@ -896,7 +895,6 @@ $code.=<<___;
|
|||||||
psrlq \$1,$Xi #
|
psrlq \$1,$Xi #
|
||||||
pclmulqdq \$0x00,$HK,$Xmn #######
|
pclmulqdq \$0x00,$HK,$Xmn #######
|
||||||
pxor $Xhi,$Xi #
|
pxor $Xhi,$Xi #
|
||||||
.byte 0x66,0x90
|
|
||||||
|
|
||||||
sub \$0x20,$len
|
sub \$0x20,$len
|
||||||
ja .Lmod_loop
|
ja .Lmod_loop
|
||||||
|
Loading…
x
Reference in New Issue
Block a user