gcm128.c and assembler modules: change argument order for gcm_ghash_4bit.

ghash-x86*.pl: fix performance numbers for Core2; as it turned out, the
previous ones were "tainted" by variable clock frequency.
This commit is contained in:
Andy Polyakov 2010-04-14 19:04:51 +00:00
parent 8decc967dc
commit 4f39edbff1
6 changed files with 38 additions and 47 deletions

View File

@ -31,10 +31,10 @@ $Thi1="t5";
$Tlo1="t6"; $Tlo1="t6";
$rem="t7"; # $8 $rem="t7"; # $8
################# #################
$Xi="a0"; # $16 $Xi="a0"; # $16, input argument block
$Htbl="a1"; $Htbl="a1";
$inp="a2";
$len="a3";
$nlo="a4"; # $20 $nlo="a4"; # $20
$nhi="a5"; $nhi="a5";
$Zhi="t8"; $Zhi="t8";
@ -314,12 +314,6 @@ $code.=<<___;
.end gcm_gmult_4bit .end gcm_gmult_4bit
___ ___
# argument block for gcm_ghash_4bit
$inp="a0"; # $16
$len="a1";
$Xi ="a2";
$Htbl="a3";
$inhi="s0"; $inhi="s0";
$inlo="s1"; $inlo="s1";

View File

@ -142,13 +142,13 @@ gcm_ghash_4bit:
.prologue .prologue
{ .mmi; .save ar.pfs,prevfs { .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,4,0,8 alloc prevfs=ar.pfs,4,4,0,8
$ADDP inp=15,in0 // &inp[15] $ADDP inp=15,in2 // &inp[15]
mov rem_4bitp=ip } mov rem_4bitp=ip }
{ .mmi; $ADDP end=in1,in0 // &inp[len] { .mmi; $ADDP end=in3,in2 // &inp[len]
$ADDP Xi=15,in2 // &Xi[15] $ADDP Xi=15,in0 // &Xi[15]
.save ar.lc,prevlc .save ar.lc,prevlc
mov prevlc=ar.lc };; mov prevlc=ar.lc };;
{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo { .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo
mov mask0xf0=0xf0 mov mask0xf0=0xf0
.save pr,prevpr .save pr,prevpr
mov prevpr=pr } mov prevpr=pr }

View File

@ -54,10 +54,10 @@ $remi="%l5";
$Htblo="%l6"; $Htblo="%l6";
$cnt="%l7"; $cnt="%l7";
$inp="%i0"; # input arguments for gcm_ghash_4bit $Xi="%i0"; # input argument block
$len="%i1"; $Htbl="%i1";
$Xi="%i2"; $inp="%i2";
$Htbl="%i3"; $len="%i3";
$code.=<<___; $code.=<<___;
.section ".text",#alloc,#execinstr .section ".text",#alloc,#execinstr
@ -208,8 +208,6 @@ gcm_ghash_4bit:
.size gcm_ghash_4bit,(.-gcm_ghash_4bit) .size gcm_ghash_4bit,(.-gcm_ghash_4bit)
___ ___
$Xi="%i0"; # input arguments for gcm_gmult_4bit
$Htbl="%i1";
undef $inp; undef $inp;
undef $len; undef $len;

View File

@ -23,7 +23,7 @@
# PIII 63 /77 16 24 # PIII 63 /77 16 24
# P4 96 /122 30 84(***) # P4 96 /122 30 84(***)
# Opteron 50 /71 21 30 # Opteron 50 /71 21 30
# Core2 63 /102 19 28 # Core2 54 /68 13 18
# #
# (*) gcc 3.4.x was observed to generate few percent slower code, # (*) gcc 3.4.x was observed to generate few percent slower code,
# which is one of reasons why 2.95.3 results were chosen, # which is one of reasons why 2.95.3 results were chosen,
@ -317,12 +317,12 @@ if ($unroll) {
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax")); &lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
&mov ($inp,&wparam(0)); # load in &mov ($Zhh,&wparam(0)); # load Xi
&mov ($Zlh,&wparam(1)); # load len &mov ($Htbl,&wparam(1)); # load Htable
&mov ($Zhh,&wparam(2)); # load Xi &mov ($inp,&wparam(2)); # load in
&mov ($Htbl,&wparam(3)); # load Htable &mov ($Zlh,&wparam(3)); # load len
&add ($Zlh,$inp); &add ($Zlh,$inp);
&mov (&wparam(1),$Zlh); # len to point at the end of input &mov (&wparam(3),$Zlh); # len to point at the end of input
&stack_push(4+1); # +1 for stack alignment &stack_push(4+1); # +1 for stack alignment
&mov ($Zll,&DWP(12,$Zhh)); # load Xi[16] &mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
&mov ($Zhl,&DWP(4,$Zhh)); &mov ($Zhl,&DWP(4,$Zhh));
@ -344,10 +344,10 @@ if ($unroll) {
&mmx_loop("esp","eax"); &mmx_loop("esp","eax");
&lea ($inp,&DWP(16,$inp)); &lea ($inp,&DWP(16,$inp));
&cmp ($inp,&wparam(1)); &cmp ($inp,&wparam(3));
&jb (&label("mmx_outer_loop")); &jb (&label("mmx_outer_loop"));
&mov ($inp,&wparam(2)); # load Xi &mov ($inp,&wparam(0)); # load Xi
&emms (); &emms ();
&mov (&DWP(12,$inp),$Zll); &mov (&DWP(12,$inp),$Zll);
&mov (&DWP(4,$inp),$Zhl); &mov (&DWP(4,$inp),$Zhl);
@ -359,12 +359,12 @@ if ($unroll) {
&set_label("x86",16); &set_label("x86",16);
} }
&stack_push(16+4+1); # +1 for 64-bit alignment &stack_push(16+4+1); # +1 for 64-bit alignment
&mov ($inp,&wparam(0)); # load in &mov ($Zll,&wparam(0)); # load Xi
&mov ("ecx",&wparam(1)); # load len &mov ($Htbl,&wparam(1)); # load Htable
&mov ($Zll,&wparam(2)); # load Xi &mov ($inp,&wparam(2)); # load in
&mov ($Htbl,&wparam(3)); # load Htable &mov ("ecx",&wparam(3)); # load len
&add ("ecx",$inp); &add ("ecx",$inp);
&mov (&wparam(1),"ecx"); &mov (&wparam(3),"ecx");
&mov ($Zhh,&DWP(0,$Zll)); # load Xi[16] &mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
&mov ($Zhl,&DWP(4,$Zll)); &mov ($Zhl,&DWP(4,$Zll));
@ -390,14 +390,14 @@ if ($unroll) {
&call ("_x86_gmult_4bit_inner"); &call ("_x86_gmult_4bit_inner");
} else { } else {
&x86_loop(0); &x86_loop(0);
&mov ($inp,&wparam(0)); &mov ($inp,&wparam(2));
} }
&lea ($inp,&DWP(16,$inp)); &lea ($inp,&DWP(16,$inp));
&cmp ($inp,&wparam(1)); &cmp ($inp,&wparam(3));
&mov (&wparam(0),$inp) if (!$unroll); &mov (&wparam(2),$inp) if (!$unroll);
&jb (&label("x86_outer_loop")); &jb (&label("x86_outer_loop"));
&mov ($inp,&wparam(2)); # load Xi &mov ($inp,&wparam(0)); # load Xi
&mov (&DWP(12,$inp),$Zll); &mov (&DWP(12,$inp),$Zll);
&mov (&DWP(8,$inp),$Zlh); &mov (&DWP(8,$inp),$Zlh);
&mov (&DWP(4,$inp),$Zhl); &mov (&DWP(4,$inp),$Zhl);

View File

@ -18,7 +18,7 @@
# gcc 3.4.x assembler # gcc 3.4.x assembler
# #
# Opteron 18.5 10.2 +80% # Opteron 18.5 10.2 +80%
# Core2 26.0 16.4 +58% # Core2 17.5 11.0 +59%
$flavour = shift; $flavour = shift;
$output = shift; $output = shift;
@ -41,10 +41,10 @@ $Zhi="%r9";
$tmp="%r10"; $tmp="%r10";
$rem_4bit = "%r11"; $rem_4bit = "%r11";
# per-function register layout
$Xi="%rdi"; $Xi="%rdi";
$Htbl="%rsi"; $Htbl="%rsi";
# per-function register layout
$cnt="%rcx"; $cnt="%rcx";
$rem="%rdx"; $rem="%rdx";
@ -159,10 +159,8 @@ ___
# per-function register layout # per-function register layout
$inp="%rdi"; $inp="%rdx";
$len="%rsi"; $len="%rcx";
$Xi="%rdx";
$Htbl="%rcx";
$cnt="%rbp"; $cnt="%rbp";
$rem="%r12"; $rem="%r12";

View File

@ -339,7 +339,7 @@ static const size_t rem_4bit[16] = {
PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560), PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) }; PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16]) static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{ {
u128 Z; u128 Z;
int cnt = 15; int cnt = 15;
@ -410,7 +410,8 @@ static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
* mostly as reference and a placeholder for possible future * mostly as reference and a placeholder for possible future
* non-trivial optimization[s]... * non-trivial optimization[s]...
*/ */
static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16]) static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
const u8 *inp,size_t len)
{ {
u128 Z; u128 Z;
int cnt; int cnt;
@ -479,13 +480,13 @@ static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
} }
#endif #endif
#else #else
void gcm_gmult_4bit(u64 Xi[2],u128 Htable[16]); void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2],u128 Htable[16]); void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif #endif
#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable) #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT) #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(in,len,ctx) gcm_ghash_4bit(in,len,(ctx)->Xi.u,(ctx)->Htable) #define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is "stride parameter" missioned to mitigate cache /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
* trashing effect. In other words idea is to hash data while it's * trashing effect. In other words idea is to hash data while it's
* still in L1 cache after encryption pass... */ * still in L1 cache after encryption pass... */