ghash-x86[_64].pl: ~15% improvement on Atom Silvermont
(other processors unaffected). (cherry picked from commit 98e143f118aedc2fa79fa0ae90f1b039da106309)
This commit is contained in:
parent
a2317c3ffd
commit
7078d93307
@ -1021,13 +1021,14 @@ my ($Xhi,$Xi) = @_;
|
|||||||
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
|
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
|
||||||
&movdqa ($Xhn,$Xn);
|
&movdqa ($Xhn,$Xn);
|
||||||
&pxor ($T1,$Xn); #
|
&pxor ($T1,$Xn); #
|
||||||
|
&lea ($inp,&DWP(32,$inp)); # i+=2
|
||||||
|
|
||||||
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
||||||
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
||||||
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
|
||||||
&pclmulqdq ($T1,$T3,0x00); #######
|
&pclmulqdq ($T1,$T3,0x00); #######
|
||||||
|
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||||
|
&nop ();
|
||||||
|
|
||||||
&lea ($inp,&DWP(32,$inp)); # i+=2
|
|
||||||
&sub ($len,0x20);
|
&sub ($len,0x20);
|
||||||
&jbe (&label("even_tail"));
|
&jbe (&label("even_tail"));
|
||||||
&jmp (&label("mod_loop"));
|
&jmp (&label("mod_loop"));
|
||||||
@ -1036,22 +1037,23 @@ my ($Xhi,$Xi) = @_;
|
|||||||
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
|
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
|
||||||
&movdqa ($Xhi,$Xi);
|
&movdqa ($Xhi,$Xi);
|
||||||
&pxor ($T2,$Xi); #
|
&pxor ($T2,$Xi); #
|
||||||
|
&nop ();
|
||||||
|
|
||||||
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
||||||
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
||||||
&movups ($Hkey,&QWP(0,$Htbl)); # load H
|
|
||||||
&pclmulqdq ($T2,$T3,0x10); #######
|
&pclmulqdq ($T2,$T3,0x10); #######
|
||||||
&movdqa ($T3,&QWP(0,$const));
|
&movups ($Hkey,&QWP(0,$Htbl)); # load H
|
||||||
|
|
||||||
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||||
|
&movdqa ($T3,&QWP(0,$const));
|
||||||
&xorps ($Xhi,$Xhn);
|
&xorps ($Xhi,$Xhn);
|
||||||
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
|
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
|
||||||
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
|
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
|
||||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||||
&pxor ($T1,$Xhi); #
|
&pxor ($T1,$Xhi); #
|
||||||
|
|
||||||
&pxor ($T2,$T1); #
|
|
||||||
&pshufb ($Xhn,$T3);
|
&pshufb ($Xhn,$T3);
|
||||||
|
&pxor ($T2,$T1); #
|
||||||
|
|
||||||
&movdqa ($T1,$T2); #
|
&movdqa ($T1,$T2); #
|
||||||
&psrldq ($T2,8);
|
&psrldq ($T2,8);
|
||||||
@ -1068,8 +1070,8 @@ my ($Xhi,$Xi) = @_;
|
|||||||
&pxor ($T1,$Xi); #
|
&pxor ($T1,$Xi); #
|
||||||
&psllq ($Xi,1);
|
&psllq ($Xi,1);
|
||||||
&pxor ($Xi,$T1); #
|
&pxor ($Xi,$T1); #
|
||||||
&movups ($T3,&QWP(32,$Htbl));
|
|
||||||
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
||||||
|
&movups ($T3,&QWP(32,$Htbl));
|
||||||
&psllq ($Xi,57); #
|
&psllq ($Xi,57); #
|
||||||
&movdqa ($T1,$Xi); #
|
&movdqa ($T1,$Xi); #
|
||||||
&pslldq ($Xi,8);
|
&pslldq ($Xi,8);
|
||||||
@ -1080,9 +1082,9 @@ my ($Xhi,$Xi) = @_;
|
|||||||
&movdqa ($T2,$Xi); # 2nd phase
|
&movdqa ($T2,$Xi); # 2nd phase
|
||||||
&psrlq ($Xi,1);
|
&psrlq ($Xi,1);
|
||||||
&pxor ($T1,$Xhn);
|
&pxor ($T1,$Xhn);
|
||||||
|
&pxor ($Xhi,$T2); #
|
||||||
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
||||||
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||||
&pxor ($Xhi,$T2); #
|
|
||||||
&pxor ($T2,$Xi);
|
&pxor ($T2,$Xi);
|
||||||
&psrlq ($Xi,5);
|
&psrlq ($Xi,5);
|
||||||
&pxor ($Xi,$T2); #
|
&pxor ($Xi,$T2); #
|
||||||
|
@ -214,6 +214,7 @@ ___
|
|||||||
|
|
||||||
$code=<<___;
|
$code=<<___;
|
||||||
.text
|
.text
|
||||||
|
.extern OPENSSL_ia32cap_P
|
||||||
|
|
||||||
.globl gcm_gmult_4bit
|
.globl gcm_gmult_4bit
|
||||||
.type gcm_gmult_4bit,\@function,2
|
.type gcm_gmult_4bit,\@function,2
|
||||||
@ -597,7 +598,8 @@ ___
|
|||||||
}
|
}
|
||||||
|
|
||||||
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
|
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
|
||||||
my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
|
my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
|
||||||
|
my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
|
||||||
|
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
.globl gcm_ghash_clmul
|
.globl gcm_ghash_clmul
|
||||||
@ -624,7 +626,6 @@ $code.=<<___ if ($win64);
|
|||||||
___
|
___
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
movdqa .Lbswap_mask(%rip),$T3
|
movdqa .Lbswap_mask(%rip),$T3
|
||||||
mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
|
|
||||||
|
|
||||||
movdqu ($Xip),$Xi
|
movdqu ($Xip),$Xi
|
||||||
movdqu ($Htbl),$Hkey
|
movdqu ($Htbl),$Hkey
|
||||||
@ -640,10 +641,16 @@ if ($do4xaggr) {
|
|||||||
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
|
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
|
||||||
|
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
|
mov OPENSSL_ia32cap_P+4(%rip),%eax
|
||||||
cmp \$0x30,$len
|
cmp \$0x30,$len
|
||||||
jb .Lskip4x
|
jb .Lskip4x
|
||||||
|
|
||||||
|
and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
|
||||||
|
cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
|
||||||
|
je .Lskip4x
|
||||||
|
|
||||||
sub \$0x30,$len
|
sub \$0x30,$len
|
||||||
|
mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
|
||||||
movdqu 0x30($Htbl),$Hkey3
|
movdqu 0x30($Htbl),$Hkey3
|
||||||
movdqu 0x40($Htbl),$Hkey4
|
movdqu 0x40($Htbl),$Hkey4
|
||||||
|
|
||||||
@ -819,51 +826,54 @@ $code.=<<___;
|
|||||||
pxor $T1,$Xi # Ii+Xi
|
pxor $T1,$Xi # Ii+Xi
|
||||||
|
|
||||||
movdqa $Xln,$Xhn
|
movdqa $Xln,$Xhn
|
||||||
pshufd \$0b01001110,$Xln,$T1
|
pshufd \$0b01001110,$Xln,$Xmn
|
||||||
pxor $Xln,$T1
|
pxor $Xln,$Xmn
|
||||||
pclmulqdq \$0x00,$Hkey,$Xln
|
pclmulqdq \$0x00,$Hkey,$Xln
|
||||||
pclmulqdq \$0x11,$Hkey,$Xhn
|
pclmulqdq \$0x11,$Hkey,$Xhn
|
||||||
pclmulqdq \$0x00,$HK,$T1
|
pclmulqdq \$0x00,$HK,$Xmn
|
||||||
|
|
||||||
lea 32($inp),$inp # i+=2
|
lea 32($inp),$inp # i+=2
|
||||||
|
nop
|
||||||
sub \$0x20,$len
|
sub \$0x20,$len
|
||||||
jbe .Leven_tail
|
jbe .Leven_tail
|
||||||
|
nop
|
||||||
jmp .Lmod_loop
|
jmp .Lmod_loop
|
||||||
|
|
||||||
.align 32
|
.align 32
|
||||||
.Lmod_loop:
|
.Lmod_loop:
|
||||||
movdqa $Xi,$Xhi
|
movdqa $Xi,$Xhi
|
||||||
pshufd \$0b01001110,$Xi,$T2 #
|
movdqa $Xmn,$T1
|
||||||
pxor $Xi,$T2 #
|
pshufd \$0b01001110,$Xi,$Xmn #
|
||||||
|
pxor $Xi,$Xmn #
|
||||||
|
|
||||||
pclmulqdq \$0x00,$Hkey2,$Xi
|
pclmulqdq \$0x00,$Hkey2,$Xi
|
||||||
pclmulqdq \$0x11,$Hkey2,$Xhi
|
pclmulqdq \$0x11,$Hkey2,$Xhi
|
||||||
pclmulqdq \$0x10,$HK,$T2
|
pclmulqdq \$0x10,$HK,$Xmn
|
||||||
|
|
||||||
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
||||||
pxor $Xhn,$Xhi
|
pxor $Xhn,$Xhi
|
||||||
movdqu ($inp),$Xhn # Ii
|
movdqu ($inp),$Xhn # Ii
|
||||||
|
pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
||||||
pshufb $T3,$Xhn
|
pshufb $T3,$Xhn
|
||||||
movdqu 16($inp),$Xln # Ii+1
|
movdqu 16($inp),$Xln # Ii+1
|
||||||
|
|
||||||
pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
|
||||||
pxor $Xhi,$T1
|
pxor $Xhi,$T1
|
||||||
pxor $Xhn,$Xhi # "Ii+Xi", consume early
|
pxor $Xhn,$Xhi # "Ii+Xi", consume early
|
||||||
pxor $T1,$T2
|
pxor $T1,$Xmn
|
||||||
pshufb $T3,$Xln
|
pshufb $T3,$Xln
|
||||||
movdqa $T2,$T1 #
|
movdqa $Xmn,$T1 #
|
||||||
psrldq \$8,$T1
|
psrldq \$8,$T1
|
||||||
pslldq \$8,$T2 #
|
pslldq \$8,$Xmn #
|
||||||
pxor $T1,$Xhi
|
pxor $T1,$Xhi
|
||||||
pxor $T2,$Xi #
|
pxor $Xmn,$Xi #
|
||||||
|
|
||||||
movdqa $Xln,$Xhn #
|
movdqa $Xln,$Xhn #
|
||||||
|
|
||||||
movdqa $Xi,$T2 # 1st phase
|
movdqa $Xi,$T2 # 1st phase
|
||||||
movdqa $Xi,$T1
|
movdqa $Xi,$T1
|
||||||
psllq \$5,$Xi
|
psllq \$5,$Xi
|
||||||
pclmulqdq \$0x00,$Hkey,$Xln #######
|
|
||||||
pxor $Xi,$T1 #
|
pxor $Xi,$T1 #
|
||||||
|
pclmulqdq \$0x00,$Hkey,$Xln #######
|
||||||
psllq \$1,$Xi
|
psllq \$1,$Xi
|
||||||
pxor $T1,$Xi #
|
pxor $T1,$Xi #
|
||||||
psllq \$57,$Xi #
|
psllq \$57,$Xi #
|
||||||
@ -871,9 +881,9 @@ $code.=<<___;
|
|||||||
pslldq \$8,$Xi
|
pslldq \$8,$Xi
|
||||||
psrldq \$8,$T1 #
|
psrldq \$8,$T1 #
|
||||||
pxor $T2,$Xi
|
pxor $T2,$Xi
|
||||||
|
pshufd \$0b01001110,$Xhn,$Xmn
|
||||||
pxor $T1,$Xhi #
|
pxor $T1,$Xhi #
|
||||||
pshufd \$0b01001110,$Xhn,$T1
|
pxor $Xhn,$Xmn #
|
||||||
pxor $Xhn,$T1 #
|
|
||||||
|
|
||||||
pclmulqdq \$0x11,$Hkey,$Xhn #######
|
pclmulqdq \$0x11,$Hkey,$Xhn #######
|
||||||
movdqa $Xi,$T2 # 2nd phase
|
movdqa $Xi,$T2 # 2nd phase
|
||||||
@ -882,33 +892,35 @@ $code.=<<___;
|
|||||||
pxor $Xi,$T2
|
pxor $Xi,$T2
|
||||||
psrlq \$5,$Xi
|
psrlq \$5,$Xi
|
||||||
pxor $T2,$Xi #
|
pxor $T2,$Xi #
|
||||||
psrlq \$1,$Xi #
|
|
||||||
pclmulqdq \$0x00,$HK,$T1 #######
|
|
||||||
pxor $Xhi,$Xi #
|
|
||||||
|
|
||||||
lea 32($inp),$inp
|
lea 32($inp),$inp
|
||||||
|
psrlq \$1,$Xi #
|
||||||
|
pclmulqdq \$0x00,$HK,$Xmn #######
|
||||||
|
pxor $Xhi,$Xi #
|
||||||
|
.byte 0x66,0x90
|
||||||
|
|
||||||
sub \$0x20,$len
|
sub \$0x20,$len
|
||||||
ja .Lmod_loop
|
ja .Lmod_loop
|
||||||
|
|
||||||
.Leven_tail:
|
.Leven_tail:
|
||||||
movdqa $Xi,$Xhi
|
movdqa $Xi,$Xhi
|
||||||
pshufd \$0b01001110,$Xi,$T2 #
|
movdqa $Xmn,$T1
|
||||||
pxor $Xi,$T2 #
|
pshufd \$0b01001110,$Xi,$Xmn #
|
||||||
|
pxor $Xi,$Xmn #
|
||||||
|
|
||||||
pclmulqdq \$0x00,$Hkey2,$Xi
|
pclmulqdq \$0x00,$Hkey2,$Xi
|
||||||
pclmulqdq \$0x11,$Hkey2,$Xhi
|
pclmulqdq \$0x11,$Hkey2,$Xhi
|
||||||
pclmulqdq \$0x10,$HK,$T2
|
pclmulqdq \$0x10,$HK,$Xmn
|
||||||
|
|
||||||
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
||||||
pxor $Xhn,$Xhi
|
pxor $Xhn,$Xhi
|
||||||
pxor $Xi,$T1
|
pxor $Xi,$T1
|
||||||
pxor $Xhi,$T1
|
pxor $Xhi,$T1
|
||||||
pxor $T1,$T2
|
pxor $T1,$Xmn
|
||||||
movdqa $T2,$T1 #
|
movdqa $Xmn,$T1 #
|
||||||
psrldq \$8,$T1
|
psrldq \$8,$T1
|
||||||
pslldq \$8,$T2 #
|
pslldq \$8,$Xmn #
|
||||||
pxor $T1,$Xhi
|
pxor $T1,$Xhi
|
||||||
pxor $T2,$Xi #
|
pxor $Xmn,$Xi #
|
||||||
___
|
___
|
||||||
&reduction_alg9 ($Xhi,$Xi);
|
&reduction_alg9 ($Xhi,$Xi);
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user