gcm128.c and assembler modules: change argument order for gcm_ghash_4bit.
ghash-x86*.pl: fix performance numbers for Core2, as it turned out that the previous ones were "tainted" by variable clock frequency.
This commit is contained in:
parent
8decc967dc
commit
4f39edbff1
@ -31,10 +31,10 @@ $Thi1="t5";
|
|||||||
$Tlo1="t6";
|
$Tlo1="t6";
|
||||||
$rem="t7"; # $8
|
$rem="t7"; # $8
|
||||||
#################
|
#################
|
||||||
$Xi="a0"; # $16
|
$Xi="a0"; # $16, input argument block
|
||||||
$Htbl="a1";
|
$Htbl="a1";
|
||||||
|
$inp="a2";
|
||||||
|
$len="a3";
|
||||||
$nlo="a4"; # $20
|
$nlo="a4"; # $20
|
||||||
$nhi="a5";
|
$nhi="a5";
|
||||||
$Zhi="t8";
|
$Zhi="t8";
|
||||||
@ -314,12 +314,6 @@ $code.=<<___;
|
|||||||
.end gcm_gmult_4bit
|
.end gcm_gmult_4bit
|
||||||
___
|
___
|
||||||
|
|
||||||
# argument block for gcm_ghash_4bit
|
|
||||||
$inp="a0"; # $16
|
|
||||||
$len="a1";
|
|
||||||
$Xi ="a2";
|
|
||||||
$Htbl="a3";
|
|
||||||
|
|
||||||
$inhi="s0";
|
$inhi="s0";
|
||||||
$inlo="s1";
|
$inlo="s1";
|
||||||
|
|
||||||
|
@ -142,13 +142,13 @@ gcm_ghash_4bit:
|
|||||||
.prologue
|
.prologue
|
||||||
{ .mmi; .save ar.pfs,prevfs
|
{ .mmi; .save ar.pfs,prevfs
|
||||||
alloc prevfs=ar.pfs,4,4,0,8
|
alloc prevfs=ar.pfs,4,4,0,8
|
||||||
$ADDP inp=15,in0 // &inp[15]
|
$ADDP inp=15,in2 // &inp[15]
|
||||||
mov rem_4bitp=ip }
|
mov rem_4bitp=ip }
|
||||||
{ .mmi; $ADDP end=in1,in0 // &inp[len]
|
{ .mmi; $ADDP end=in3,in2 // &inp[len]
|
||||||
$ADDP Xi=15,in2 // &Xi[15]
|
$ADDP Xi=15,in0 // &Xi[15]
|
||||||
.save ar.lc,prevlc
|
.save ar.lc,prevlc
|
||||||
mov prevlc=ar.lc };;
|
mov prevlc=ar.lc };;
|
||||||
{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo
|
{ .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo
|
||||||
mov mask0xf0=0xf0
|
mov mask0xf0=0xf0
|
||||||
.save pr,prevpr
|
.save pr,prevpr
|
||||||
mov prevpr=pr }
|
mov prevpr=pr }
|
||||||
|
@ -54,10 +54,10 @@ $remi="%l5";
|
|||||||
$Htblo="%l6";
|
$Htblo="%l6";
|
||||||
$cnt="%l7";
|
$cnt="%l7";
|
||||||
|
|
||||||
$inp="%i0"; # input arguments for gcm_ghash_4bit
|
$Xi="%i0"; # input argument block
|
||||||
$len="%i1";
|
$Htbl="%i1";
|
||||||
$Xi="%i2";
|
$inp="%i2";
|
||||||
$Htbl="%i3";
|
$len="%i3";
|
||||||
|
|
||||||
$code.=<<___;
|
$code.=<<___;
|
||||||
.section ".text",#alloc,#execinstr
|
.section ".text",#alloc,#execinstr
|
||||||
@ -208,8 +208,6 @@ gcm_ghash_4bit:
|
|||||||
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
|
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
|
||||||
___
|
___
|
||||||
|
|
||||||
$Xi="%i0"; # input arguments for gcm_gmult_4bit
|
|
||||||
$Htbl="%i1";
|
|
||||||
undef $inp;
|
undef $inp;
|
||||||
undef $len;
|
undef $len;
|
||||||
|
|
||||||
|
@ -23,7 +23,7 @@
|
|||||||
# PIII 63 /77 16 24
|
# PIII 63 /77 16 24
|
||||||
# P4 96 /122 30 84(***)
|
# P4 96 /122 30 84(***)
|
||||||
# Opteron 50 /71 21 30
|
# Opteron 50 /71 21 30
|
||||||
# Core2 63 /102 19 28
|
# Core2 54 /68 13 18
|
||||||
#
|
#
|
||||||
# (*) gcc 3.4.x was observed to generate a few percent slower code,
|
# (*) gcc 3.4.x was observed to generate a few percent slower code,
|
||||||
# which is one of the reasons why 2.95.3 results were chosen,
|
# which is one of the reasons why 2.95.3 results were chosen,
|
||||||
@ -317,12 +317,12 @@ if ($unroll) {
|
|||||||
|
|
||||||
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
|
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
|
||||||
|
|
||||||
&mov ($inp,&wparam(0)); # load in
|
&mov ($Zhh,&wparam(0)); # load Xi
|
||||||
&mov ($Zlh,&wparam(1)); # load len
|
&mov ($Htbl,&wparam(1)); # load Htable
|
||||||
&mov ($Zhh,&wparam(2)); # load Xi
|
&mov ($inp,&wparam(2)); # load in
|
||||||
&mov ($Htbl,&wparam(3)); # load Htable
|
&mov ($Zlh,&wparam(3)); # load len
|
||||||
&add ($Zlh,$inp);
|
&add ($Zlh,$inp);
|
||||||
&mov (&wparam(1),$Zlh); # len to point at the end of input
|
&mov (&wparam(3),$Zlh); # len to point at the end of input
|
||||||
&stack_push(4+1); # +1 for stack alignment
|
&stack_push(4+1); # +1 for stack alignment
|
||||||
&mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
|
&mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
|
||||||
&mov ($Zhl,&DWP(4,$Zhh));
|
&mov ($Zhl,&DWP(4,$Zhh));
|
||||||
@ -344,10 +344,10 @@ if ($unroll) {
|
|||||||
&mmx_loop("esp","eax");
|
&mmx_loop("esp","eax");
|
||||||
|
|
||||||
&lea ($inp,&DWP(16,$inp));
|
&lea ($inp,&DWP(16,$inp));
|
||||||
&cmp ($inp,&wparam(1));
|
&cmp ($inp,&wparam(3));
|
||||||
&jb (&label("mmx_outer_loop"));
|
&jb (&label("mmx_outer_loop"));
|
||||||
|
|
||||||
&mov ($inp,&wparam(2)); # load Xi
|
&mov ($inp,&wparam(0)); # load Xi
|
||||||
&emms ();
|
&emms ();
|
||||||
&mov (&DWP(12,$inp),$Zll);
|
&mov (&DWP(12,$inp),$Zll);
|
||||||
&mov (&DWP(4,$inp),$Zhl);
|
&mov (&DWP(4,$inp),$Zhl);
|
||||||
@ -359,12 +359,12 @@ if ($unroll) {
|
|||||||
&set_label("x86",16);
|
&set_label("x86",16);
|
||||||
}
|
}
|
||||||
&stack_push(16+4+1); # +1 for 64-bit alignment
|
&stack_push(16+4+1); # +1 for 64-bit alignment
|
||||||
&mov ($inp,&wparam(0)); # load in
|
&mov ($Zll,&wparam(0)); # load Xi
|
||||||
&mov ("ecx",&wparam(1)); # load len
|
&mov ($Htbl,&wparam(1)); # load Htable
|
||||||
&mov ($Zll,&wparam(2)); # load Xi
|
&mov ($inp,&wparam(2)); # load in
|
||||||
&mov ($Htbl,&wparam(3)); # load Htable
|
&mov ("ecx",&wparam(3)); # load len
|
||||||
&add ("ecx",$inp);
|
&add ("ecx",$inp);
|
||||||
&mov (&wparam(1),"ecx");
|
&mov (&wparam(3),"ecx");
|
||||||
|
|
||||||
&mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
|
&mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
|
||||||
&mov ($Zhl,&DWP(4,$Zll));
|
&mov ($Zhl,&DWP(4,$Zll));
|
||||||
@ -390,14 +390,14 @@ if ($unroll) {
|
|||||||
&call ("_x86_gmult_4bit_inner");
|
&call ("_x86_gmult_4bit_inner");
|
||||||
} else {
|
} else {
|
||||||
&x86_loop(0);
|
&x86_loop(0);
|
||||||
&mov ($inp,&wparam(0));
|
&mov ($inp,&wparam(2));
|
||||||
}
|
}
|
||||||
&lea ($inp,&DWP(16,$inp));
|
&lea ($inp,&DWP(16,$inp));
|
||||||
&cmp ($inp,&wparam(1));
|
&cmp ($inp,&wparam(3));
|
||||||
&mov (&wparam(0),$inp) if (!$unroll);
|
&mov (&wparam(2),$inp) if (!$unroll);
|
||||||
&jb (&label("x86_outer_loop"));
|
&jb (&label("x86_outer_loop"));
|
||||||
|
|
||||||
&mov ($inp,&wparam(2)); # load Xi
|
&mov ($inp,&wparam(0)); # load Xi
|
||||||
&mov (&DWP(12,$inp),$Zll);
|
&mov (&DWP(12,$inp),$Zll);
|
||||||
&mov (&DWP(8,$inp),$Zlh);
|
&mov (&DWP(8,$inp),$Zlh);
|
||||||
&mov (&DWP(4,$inp),$Zhl);
|
&mov (&DWP(4,$inp),$Zhl);
|
||||||
|
@ -18,7 +18,7 @@
|
|||||||
# gcc 3.4.x assembler
|
# gcc 3.4.x assembler
|
||||||
#
|
#
|
||||||
# Opteron 18.5 10.2 +80%
|
# Opteron 18.5 10.2 +80%
|
||||||
# Core2 26.0 16.4 +58%
|
# Core2 17.5 11.0 +59%
|
||||||
|
|
||||||
$flavour = shift;
|
$flavour = shift;
|
||||||
$output = shift;
|
$output = shift;
|
||||||
@ -41,10 +41,10 @@ $Zhi="%r9";
|
|||||||
$tmp="%r10";
|
$tmp="%r10";
|
||||||
$rem_4bit = "%r11";
|
$rem_4bit = "%r11";
|
||||||
|
|
||||||
# per-function register layout
|
|
||||||
$Xi="%rdi";
|
$Xi="%rdi";
|
||||||
$Htbl="%rsi";
|
$Htbl="%rsi";
|
||||||
|
|
||||||
|
# per-function register layout
|
||||||
$cnt="%rcx";
|
$cnt="%rcx";
|
||||||
$rem="%rdx";
|
$rem="%rdx";
|
||||||
|
|
||||||
@ -159,10 +159,8 @@ ___
|
|||||||
|
|
||||||
|
|
||||||
# per-function register layout
|
# per-function register layout
|
||||||
$inp="%rdi";
|
$inp="%rdx";
|
||||||
$len="%rsi";
|
$len="%rcx";
|
||||||
$Xi="%rdx";
|
|
||||||
$Htbl="%rcx";
|
|
||||||
|
|
||||||
$cnt="%rbp";
|
$cnt="%rbp";
|
||||||
$rem="%r12";
|
$rem="%r12";
|
||||||
|
@ -339,7 +339,7 @@ static const size_t rem_4bit[16] = {
|
|||||||
PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
|
PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
|
||||||
PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
|
PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
|
||||||
|
|
||||||
static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
|
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
|
||||||
{
|
{
|
||||||
u128 Z;
|
u128 Z;
|
||||||
int cnt = 15;
|
int cnt = 15;
|
||||||
@ -410,7 +410,8 @@ static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
|
|||||||
* mostly as reference and a placeholder for possible future
|
* mostly as reference and a placeholder for possible future
|
||||||
* non-trivial optimization[s]...
|
* non-trivial optimization[s]...
|
||||||
*/
|
*/
|
||||||
static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
|
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
|
||||||
|
const u8 *inp,size_t len)
|
||||||
{
|
{
|
||||||
u128 Z;
|
u128 Z;
|
||||||
int cnt;
|
int cnt;
|
||||||
@ -479,13 +480,13 @@ static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
void gcm_gmult_4bit(u64 Xi[2],u128 Htable[16]);
|
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
|
||||||
void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2],u128 Htable[16]);
|
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
|
#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
|
||||||
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
|
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
|
||||||
#define GHASH(in,len,ctx) gcm_ghash_4bit(in,len,(ctx)->Xi.u,(ctx)->Htable)
|
#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
|
||||||
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
|
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
|
||||||
* thrashing effect. In other words, the idea is to hash data while it's
|
* thrashing effect. In other words, the idea is to hash data while it's
|
||||||
* still in L1 cache after the encryption pass... */
|
* still in L1 cache after the encryption pass... */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user