ghash-x86[_64].pl: ~15% performance improvement on Atom Silvermont
(other processors unaffected).
This commit is contained in:
parent
fc92396976
commit
98e143f118
@ -1021,13 +1021,14 @@ my ($Xhi,$Xi) = @_;
|
||||
&pshufd ($T1,$Xn,0b01001110); # H*Ii+1
|
||||
&movdqa ($Xhn,$Xn);
|
||||
&pxor ($T1,$Xn); #
|
||||
&lea ($inp,&DWP(32,$inp)); # i+=2
|
||||
|
||||
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
||||
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
&pclmulqdq ($T1,$T3,0x00); #######
|
||||
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
&nop ();
|
||||
|
||||
&lea ($inp,&DWP(32,$inp)); # i+=2
|
||||
&sub ($len,0x20);
|
||||
&jbe (&label("even_tail"));
|
||||
&jmp (&label("mod_loop"));
|
||||
@ -1036,22 +1037,23 @@ my ($Xhi,$Xi) = @_;
|
||||
&pshufd ($T2,$Xi,0b01001110); # H^2*(Ii+Xi)
|
||||
&movdqa ($Xhi,$Xi);
|
||||
&pxor ($T2,$Xi); #
|
||||
&nop ();
|
||||
|
||||
&pclmulqdq ($Xi,$Hkey,0x00); #######
|
||||
&pclmulqdq ($Xhi,$Hkey,0x11); #######
|
||||
&movups ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
&pclmulqdq ($T2,$T3,0x10); #######
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&movups ($Hkey,&QWP(0,$Htbl)); # load H
|
||||
|
||||
&xorps ($Xi,$Xn); # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
&movdqa ($T3,&QWP(0,$const));
|
||||
&xorps ($Xhi,$Xhn);
|
||||
&movdqu ($Xhn,&QWP(0,$inp)); # Ii
|
||||
&pxor ($T1,$Xi); # aggregated Karatsuba post-processing
|
||||
&movdqu ($Xn,&QWP(16,$inp)); # Ii+1
|
||||
&pxor ($T1,$Xhi); #
|
||||
|
||||
&pxor ($T2,$T1); #
|
||||
&pshufb ($Xhn,$T3);
|
||||
&pxor ($T2,$T1); #
|
||||
|
||||
&movdqa ($T1,$T2); #
|
||||
&psrldq ($T2,8);
|
||||
@ -1068,8 +1070,8 @@ my ($Xhi,$Xi) = @_;
|
||||
&pxor ($T1,$Xi); #
|
||||
&psllq ($Xi,1);
|
||||
&pxor ($Xi,$T1); #
|
||||
&movups ($T3,&QWP(32,$Htbl));
|
||||
&pclmulqdq ($Xn,$Hkey,0x00); #######
|
||||
&movups ($T3,&QWP(32,$Htbl));
|
||||
&psllq ($Xi,57); #
|
||||
&movdqa ($T1,$Xi); #
|
||||
&pslldq ($Xi,8);
|
||||
@ -1080,9 +1082,9 @@ my ($Xhi,$Xi) = @_;
|
||||
&movdqa ($T2,$Xi); # 2nd phase
|
||||
&psrlq ($Xi,1);
|
||||
&pxor ($T1,$Xhn);
|
||||
&pxor ($Xhi,$T2); #
|
||||
&pclmulqdq ($Xhn,$Hkey,0x11); #######
|
||||
&movups ($Hkey,&QWP(16,$Htbl)); # load H^2
|
||||
&pxor ($Xhi,$T2); #
|
||||
&pxor ($T2,$Xi);
|
||||
&psrlq ($Xi,5);
|
||||
&pxor ($Xi,$T2); #
|
||||
|
@ -214,6 +214,7 @@ ___
|
||||
|
||||
$code=<<___;
|
||||
.text
|
||||
.extern OPENSSL_ia32cap_P
|
||||
|
||||
.globl gcm_gmult_4bit
|
||||
.type gcm_gmult_4bit,\@function,2
|
||||
@ -597,7 +598,8 @@ ___
|
||||
}
|
||||
|
||||
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
|
||||
my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
|
||||
my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
|
||||
my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
|
||||
|
||||
$code.=<<___;
|
||||
.globl gcm_ghash_clmul
|
||||
@ -624,7 +626,6 @@ $code.=<<___ if ($win64);
|
||||
___
|
||||
$code.=<<___;
|
||||
movdqa .Lbswap_mask(%rip),$T3
|
||||
mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
|
||||
|
||||
movdqu ($Xip),$Xi
|
||||
movdqu ($Htbl),$Hkey
|
||||
@ -640,10 +641,16 @@ if ($do4xaggr) {
|
||||
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
|
||||
|
||||
$code.=<<___;
|
||||
mov OPENSSL_ia32cap_P+4(%rip),%eax
|
||||
cmp \$0x30,$len
|
||||
jb .Lskip4x
|
||||
|
||||
and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
|
||||
cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
|
||||
je .Lskip4x
|
||||
|
||||
sub \$0x30,$len
|
||||
mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
|
||||
movdqu 0x30($Htbl),$Hkey3
|
||||
movdqu 0x40($Htbl),$Hkey4
|
||||
|
||||
@ -819,51 +826,54 @@ $code.=<<___;
|
||||
pxor $T1,$Xi # Ii+Xi
|
||||
|
||||
movdqa $Xln,$Xhn
|
||||
pshufd \$0b01001110,$Xln,$T1
|
||||
pxor $Xln,$T1
|
||||
pshufd \$0b01001110,$Xln,$Xmn
|
||||
pxor $Xln,$Xmn
|
||||
pclmulqdq \$0x00,$Hkey,$Xln
|
||||
pclmulqdq \$0x11,$Hkey,$Xhn
|
||||
pclmulqdq \$0x00,$HK,$T1
|
||||
pclmulqdq \$0x00,$HK,$Xmn
|
||||
|
||||
lea 32($inp),$inp # i+=2
|
||||
nop
|
||||
sub \$0x20,$len
|
||||
jbe .Leven_tail
|
||||
nop
|
||||
jmp .Lmod_loop
|
||||
|
||||
.align 32
|
||||
.Lmod_loop:
|
||||
movdqa $Xi,$Xhi
|
||||
pshufd \$0b01001110,$Xi,$T2 #
|
||||
pxor $Xi,$T2 #
|
||||
movdqa $Xmn,$T1
|
||||
pshufd \$0b01001110,$Xi,$Xmn #
|
||||
pxor $Xi,$Xmn #
|
||||
|
||||
pclmulqdq \$0x00,$Hkey2,$Xi
|
||||
pclmulqdq \$0x11,$Hkey2,$Xhi
|
||||
pclmulqdq \$0x10,$HK,$T2
|
||||
pclmulqdq \$0x10,$HK,$Xmn
|
||||
|
||||
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
pxor $Xhn,$Xhi
|
||||
movdqu ($inp),$Xhn # Ii
|
||||
pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
||||
pshufb $T3,$Xhn
|
||||
movdqu 16($inp),$Xln # Ii+1
|
||||
|
||||
pxor $Xi,$T1 # aggregated Karatsuba post-processing
|
||||
pxor $Xhi,$T1
|
||||
pxor $Xhn,$Xhi # "Ii+Xi", consume early
|
||||
pxor $T1,$T2
|
||||
pxor $T1,$Xmn
|
||||
pshufb $T3,$Xln
|
||||
movdqa $T2,$T1 #
|
||||
movdqa $Xmn,$T1 #
|
||||
psrldq \$8,$T1
|
||||
pslldq \$8,$T2 #
|
||||
pslldq \$8,$Xmn #
|
||||
pxor $T1,$Xhi
|
||||
pxor $T2,$Xi #
|
||||
pxor $Xmn,$Xi #
|
||||
|
||||
movdqa $Xln,$Xhn #
|
||||
|
||||
movdqa $Xi,$T2 # 1st phase
|
||||
movdqa $Xi,$T1
|
||||
psllq \$5,$Xi
|
||||
pclmulqdq \$0x00,$Hkey,$Xln #######
|
||||
pxor $Xi,$T1 #
|
||||
pclmulqdq \$0x00,$Hkey,$Xln #######
|
||||
psllq \$1,$Xi
|
||||
pxor $T1,$Xi #
|
||||
psllq \$57,$Xi #
|
||||
@ -871,9 +881,9 @@ $code.=<<___;
|
||||
pslldq \$8,$Xi
|
||||
psrldq \$8,$T1 #
|
||||
pxor $T2,$Xi
|
||||
pshufd \$0b01001110,$Xhn,$Xmn
|
||||
pxor $T1,$Xhi #
|
||||
pshufd \$0b01001110,$Xhn,$T1
|
||||
pxor $Xhn,$T1 #
|
||||
pxor $Xhn,$Xmn #
|
||||
|
||||
pclmulqdq \$0x11,$Hkey,$Xhn #######
|
||||
movdqa $Xi,$T2 # 2nd phase
|
||||
@ -882,33 +892,35 @@ $code.=<<___;
|
||||
pxor $Xi,$T2
|
||||
psrlq \$5,$Xi
|
||||
pxor $T2,$Xi #
|
||||
psrlq \$1,$Xi #
|
||||
pclmulqdq \$0x00,$HK,$T1 #######
|
||||
pxor $Xhi,$Xi #
|
||||
|
||||
lea 32($inp),$inp
|
||||
psrlq \$1,$Xi #
|
||||
pclmulqdq \$0x00,$HK,$Xmn #######
|
||||
pxor $Xhi,$Xi #
|
||||
.byte 0x66,0x90
|
||||
|
||||
sub \$0x20,$len
|
||||
ja .Lmod_loop
|
||||
|
||||
.Leven_tail:
|
||||
movdqa $Xi,$Xhi
|
||||
pshufd \$0b01001110,$Xi,$T2 #
|
||||
pxor $Xi,$T2 #
|
||||
movdqa $Xmn,$T1
|
||||
pshufd \$0b01001110,$Xi,$Xmn #
|
||||
pxor $Xi,$Xmn #
|
||||
|
||||
pclmulqdq \$0x00,$Hkey2,$Xi
|
||||
pclmulqdq \$0x11,$Hkey2,$Xhi
|
||||
pclmulqdq \$0x10,$HK,$T2
|
||||
pclmulqdq \$0x10,$HK,$Xmn
|
||||
|
||||
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
|
||||
pxor $Xhn,$Xhi
|
||||
pxor $Xi,$T1
|
||||
pxor $Xhi,$T1
|
||||
pxor $T1,$T2
|
||||
movdqa $T2,$T1 #
|
||||
pxor $T1,$Xmn
|
||||
movdqa $Xmn,$T1 #
|
||||
psrldq \$8,$T1
|
||||
pslldq \$8,$T2 #
|
||||
pslldq \$8,$Xmn #
|
||||
pxor $T1,$Xhi
|
||||
pxor $T2,$Xi #
|
||||
pxor $Xmn,$Xi #
|
||||
___
|
||||
&reduction_alg9 ($Xhi,$Xi);
|
||||
$code.=<<___;
|
||||
|
Loading…
x
Reference in New Issue
Block a user