ghash-x86.pl: commentary updates.
commit 04e2b793d6
parent 8525950e7e
@@ -30,6 +30,8 @@
 # (*)  gcc 3.4.x was observed to generate few percent slower code,
 #      which is one of reasons why 2.95.3 results were chosen,
 #      another reason is lack of 3.4.x results for older CPUs;
+#      comparison is not completely fair, because C results are
+#      for vanilla "256B" implementations, not "528B";-)
 # (**) second number is result for code compiled with -fPIC flag,
 #      which is actually more relevant, because assembler code is
 #      position-independent;
@@ -472,8 +474,8 @@ $S=12; # shift factor for rem_4bit
 &function_end("gcm_ghash_4bit_mmx");
 
 }} else {{ # "June" MMX version...
-# ... has "April" gcm_gmult_4bit_mmx with folded loop.
-# This is done to conserve code size...
+# ... has slower "April" gcm_gmult_4bit_mmx with folded
+# loop. This is done to conserve code size...
 $S=16; # shift factor for rem_4bit
 
 sub mmx_loop() {
@@ -592,7 +594,7 @@ sub mmx_loop() {
 ######################################################################
 # Below subroutine is "528B" variant of "4-bit" GCM GHASH function
 # (see gcm128.c for details). It provides further 20-40% performance
-# improvement over *previous* version of this module.
+# improvement over above mentioned "May" version.
 
 &static_label("rem_8bit");
 
@@ -631,7 +633,7 @@ sub mmx_loop() {
 &lea ("ebp",&DWP(16+256+128,"esp"));
 
 # decompose Htable (low and high parts are kept separately),
-# generate Htable>>4, save to stack...
+# generate Htable[]>>4, (u8)(Htable[]<<4), save to stack...
 for ($i=0;$i<18;$i++) {
 
 &mov ("edx",&DWP(16*$i+8-128,$Htbl)) if ($i<16);
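The decomposition comment is easier to follow in plain C. Here is a minimal sketch, after the conventions of gcm128.c; the names and flat arrays are illustrative, not the assembler's actual stack layout. It also explains the "528B" label: 256 bytes for the split Htable halves, plus 256 bytes for Htable[]>>4, plus 16 bytes of (u8)(Htable[]<<4) comes to 528 bytes, which matches the 528 offset that shows up further down.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;        /* as in gcm128.c */

/* Hypothetical helper (illustrative names): split each Htable entry
 * into separately stored halves, precompute the 128-bit value shifted
 * right by 4, and keep the byte (u8)(Htable[i]<<4) that later indexes
 * rem_8bit[]. */
static void decompose_htable(const u128 Htable[16],
                             uint64_t Hlo[16],  uint64_t Hhi[16],
                             uint64_t H4lo[16], uint64_t H4hi[16],
                             uint8_t  Hb[16])
{
    for (int i = 0; i < 16; i++) {
        Hlo[i]  = Htable[i].lo;                               /* low half    */
        Hhi[i]  = Htable[i].hi;                               /* high half   */
        H4lo[i] = (Htable[i].lo >> 4) | (Htable[i].hi << 60); /* Htable[]>>4 */
        H4hi[i] =  Htable[i].hi >> 4;
        Hb[i]   = (uint8_t)(Htable[i].lo << 4);         /* (u8)(Htable[]<<4) */
    }
}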
@@ -669,7 +671,7 @@ sub mmx_loop() {
 my @red = ("mm0","mm1","mm2");
 my $tmp = "mm3";
 
-&xor ($dat,&DWP(12,"ecx")); # merge input
+&xor ($dat,&DWP(12,"ecx")); # merge input data
 &xor ("ebx",&DWP(8,"ecx"));
 &pxor ($Zhi,&QWP(0,"ecx"));
 &lea ("ecx",&DWP(16,"ecx")); # inp+=16
@@ -685,15 +687,15 @@ sub mmx_loop() {
 &and (&LB($nlo),0x0f);
 &shr ($nhi[1],4);
 &pxor ($red[0],$red[0]);
 &rol ($dat,8); # next byte
 &pxor ($red[1],$red[1]);
 &pxor ($red[2],$red[2]);
 
 # Just like in "May" verson modulo-schedule for critical path in
-# 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final xor
-# is scheduled so late that rem_8bit is shifted *right* by 16,
-# which is why last argument to pinsrw is 2, which corresponds to
-# <<32...
+# 'Z.hi ^= rem_8bit[Z.lo&0xff^((u8)H[nhi]<<4)]<<48'. Final 'pxor'
+# is scheduled so late that rem_8bit[] has to be shifted *right*
+# by 16, which is why last argument to pinsrw is 2, which
+# corresponds to <<32=<<48>>16...
 for ($j=11,$i=0;$i<15;$i++) {
 
 if ($i>0) {
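For reference, the same per-byte step without the modulo scheduling can be modeled in C as below. Hlo/Hhi/H4lo/H4hi/Hb refer to the hypothetical decomposition sketched above; the sketch shows data flow only, and the index convention simply follows the quoted expression. In the scheduled code the final 'pxor' of the reduction word happens so much later that it is inserted at word slot 2 of the MMX register, i.e. at bit offset 32 rather than 48, exactly as the comment says.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* One per-byte step of the "528B" loop, written without the MMX
 * scheduling. Z is the 128-bit accumulator, nlo/nhi the two nibbles
 * of the current input^Xi byte, rem_8bit a 256-entry u16 reduction
 * table. Illustrative model, not the generated code. */
static u128 ghash_byte_step(u128 Z, unsigned nlo, unsigned nhi,
                            const uint64_t Hlo[16],  const uint64_t Hhi[16],
                            const uint64_t H4lo[16], const uint64_t H4hi[16],
                            const uint8_t Hb[16],
                            const uint16_t rem_8bit[256])
{
    uint8_t rem;

    Z.lo ^= Hlo[nlo];  Z.hi ^= Hhi[nlo];      /* Z ^= H[nlo]                 */
    rem   = (uint8_t)Z.lo ^ Hb[nhi];          /* Z.lo&0xff^((u8)H[nhi]<<4)   */
    Z.lo  = (Z.hi << 56) | (Z.lo >> 8);       /* Z >>= 8                     */
    Z.hi >>= 8;
    Z.lo ^= H4lo[nhi];  Z.hi ^= H4hi[nhi];    /* Z ^= H[nhi]>>4              */
    Z.hi ^= (uint64_t)rem_8bit[rem] << 48;    /* the critical-path fold      */
    return Z;
}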
@@ -703,18 +705,18 @@ sub mmx_loop() {
 
 &pxor ($Zlo,$tmp);
 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
-&xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^H[nhi]<<4
+&xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
 } else {
 &movq ($Zlo,&QWP(16,"esp",$nlo,8));
 &movq ($Zhi,&QWP(16+128,"esp",$nlo,8));
 }
 
 &mov (&LB($nlo),&LB($dat));
 &mov ($dat,&DWP(528+$j,"esp")) if (--$j%4==0);
 
 &movd ($rem[0],$Zlo);
 &movz ($rem[1],&LB($rem[1])) if ($i>0);
-&psrlq ($Zlo,8);
+&psrlq ($Zlo,8); # Z>>=8
 
 &movq ($tmp,$Zhi);
 &mov ($nhi[0],$nlo);
@@ -735,7 +737,7 @@ sub mmx_loop() {
 
 &pxor ($Zlo,&QWP(16,"esp",$nlo,8)); # Z^=H[nlo]
 &pxor ($Zhi,&QWP(16+128,"esp",$nlo,8));
-&xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); #$rem[0]); # rem^H[nhi]<<4
+&xor (&LB($rem[1]),&BP(0,"esp",$nhi[0])); # rem^(H[nhi]<<4)
 
 &pxor ($Zlo,$tmp);
 &pxor ($Zhi,&QWP(16+256+128,"esp",$nhi[0],8));
@@ -745,11 +747,11 @@ sub mmx_loop() {
 &psllq ($red[1],4);
 
 &movd ($rem[0],$Zlo);
-&psrlq ($Zlo,4);
+&psrlq ($Zlo,4); # Z>>=4
 
 &movq ($tmp,$Zhi);
 &psrlq ($Zhi,4);
-&shl ($rem[0],4);
+&shl ($rem[0],4); # rem<<4
 
 &pxor ($Zlo,&QWP(16,"esp",$nhi[1],8)); # Z^=H[nhi]
 &psllq ($tmp,60);
@@ -762,9 +764,9 @@ sub mmx_loop() {
 &pxor ($Zhi,$red[1]);
 
 &movd ($dat,$Zlo);
-&pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3);
+&pinsrw ($red[2],&WP(0,$rem_8bit,$rem[0],2),3); # last is <<48
 
-&psllq ($red[0],12);
+&psllq ($red[0],12); # correct by <<16>>4
 &pxor ($Zhi,$red[0]);
 &psrlq ($Zlo,32);
 &pxor ($Zhi,$red[2]);
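The tail above performs the last, single 4-bit step plus the corrections for the deferred reduction words. An unscheduled model of such a step, after the rem_4bit variant in gcm128.c (entries written here with an explicit <<48; in gcm128.c they are pre-shifted), looks as follows. The rem<<4 index computed earlier would then correspond to fetching this 4-bit constant out of the 8-bit table, as a dropped byte whose other nibble is zero; that reading is inferred from the code, not stated in the source.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* One plain 4-bit step, modeled on the rem_4bit code in gcm128.c.
 * Illustrative sketch, not the scheduled MMX tail. */
static u128 ghash_nibble_step(u128 Z, unsigned nhi,
                              const u128 Htable[16],
                              const uint16_t rem_4bit[16])
{
    unsigned rem = (unsigned)Z.lo & 0xf;      /* bits that >>4 drops   */

    Z.lo  = (Z.hi << 60) | (Z.lo >> 4);       /* Z >>= 4               */
    Z.hi >>= 4;
    Z.hi ^= (uint64_t)rem_4bit[rem] << 48;    /* fold remainder back   */
    Z.hi ^= Htable[nhi].hi;                   /* Z ^= H[nhi]           */
    Z.lo ^= Htable[nhi].lo;
    return Z;
}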
@@ -1316,15 +1318,16 @@ my ($Xhi,$Xi)=@_;
 # per-invocation lookup table setup. Latter means that table size is
 # chosen depending on how much data is to be hashed in every given call,
 # more data - larger table. Best reported result for Core2 is ~4 cycles
-# per processed byte out of 64KB block. Recall that this number accounts
-# even for 64KB table setup overhead. As discussed in gcm128.c we choose
-# to be more conservative in respect to lookup table sizes, but how
-# do the results compare? Minimalistic "256B" MMX version delivers ~11
-# cycles on same platform. As also discussed in gcm128.c, next in line
-# "8-bit Shoup's" method should deliver twice the performance of "4-bit"
-# one. It should be also be noted that in SSE2 case improvement can be
-# "super-linear," i.e. more than twice, mostly because >>8 maps to
-# single instruction on SSE2 register. This is unlike "4-bit" case when
-# >>4 maps to same amount of instructions in both MMX and SSE2 cases.
+# per processed byte out of 64KB block. This number accounts even for
+# 64KB table setup overhead. As discussed in gcm128.c we choose to be
+# more conservative in respect to lookup table sizes, but how do the
+# results compare? Minimalistic "256B" MMX version delivers ~11 cycles
+# on same platform. As also discussed in gcm128.c, next in line "8-bit
+# Shoup's" or "4KB" method should deliver twice the performance of
+# "256B" one, in other words not worse than ~6 cycles per byte. It
+# should be also be noted that in SSE2 case improvement can be "super-
+# linear," i.e. more than twice, mostly because >>8 maps to single
+# instruction on SSE2 register. This is unlike "4-bit" case when >>4
+# maps to same amount of instructions in both MMX and SSE2 cases.
 # Bottom line is that switch to SSE2 is considered to be justifiable
 # only in case we choose to implement "8-bit" method...
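To make the comparison concrete, one step of the "8-bit Shoup's" ("4KB") method processes a whole byte with a single lookup into a 256-entry table; a sketch follows, modeled on the "8-bit" code in gcm128.c, with the reduction constants written as an explicit <<48. The "super-linear" remark is about the Z>>=8 line: with all 128 bits in one SSE2 register that shift is a single psrldq, whereas on MMX it costs a shift/shift/or across two 64-bit registers; a >>4, by contrast, needs the multi-instruction form in both cases, because psrldq shifts by whole bytes only.

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* One byte-sized step of the "8-bit Shoup's" method: Htable has 256
 * entries, rem_8bit holds 16-bit reduction constants. Illustrative
 * sketch after gcm128.c, not this module's code. */
static u128 ghash_8bit_step(u128 Z, unsigned n,
                            const u128 Htable[256],
                            const uint16_t rem_8bit[256])
{
    unsigned rem;

    Z.hi ^= Htable[n].hi;                     /* Z ^= H[n]                   */
    Z.lo ^= Htable[n].lo;
    rem   = (unsigned)Z.lo & 0xff;            /* byte about to fall off      */
    Z.lo  = (Z.hi << 56) | (Z.lo >> 8);       /* Z >>= 8: one psrldq on SSE2 */
    Z.hi >>= 8;
    Z.hi ^= (uint64_t)rem_8bit[rem] << 48;    /* fold remainder back         */
    return Z;
}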
|
Loading…
x
Reference in New Issue
Block a user