Profiling revealed that OPENSSL_cleanse consumes *more* CPU time than
sha1_block_data_order when hashing short messages. Move OPENSSL_cleanse to "cpuid" assembler module and gain 2x.
This commit is contained in:
parent
932cc129ee
commit
b2dba9bf1f
@ -1209,6 +1209,7 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /bn86/);
|
|||||||
|
|
||||||
$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);
|
$cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);
|
||||||
|
|
||||||
|
$cpuid_obj="mem_clr.o" unless ($cpuid_obj =~ /\.o$/);
|
||||||
$des_obj=$des_enc unless ($des_obj =~ /\.o$/);
|
$des_obj=$des_enc unless ($des_obj =~ /\.o$/);
|
||||||
$bf_obj=$bf_enc unless ($bf_obj =~ /\.o$/);
|
$bf_obj=$bf_enc unless ($bf_obj =~ /\.o$/);
|
||||||
$cast_obj=$cast_enc unless ($cast_obj =~ /\.o$/);
|
$cast_obj=$cast_enc unless ($cast_obj =~ /\.o$/);
|
||||||
@ -1481,7 +1482,7 @@ print OUT "#ifdef OPENSSL_ALGORITHM_DEFINES\n";
|
|||||||
print OUT $openssl_algorithm_defines_trans;
|
print OUT $openssl_algorithm_defines_trans;
|
||||||
print OUT "#endif\n\n";
|
print OUT "#endif\n\n";
|
||||||
|
|
||||||
print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj);
|
print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj ne "mem_clr.o");
|
||||||
|
|
||||||
while (<IN>)
|
while (<IN>)
|
||||||
{
|
{
|
||||||
|
@ -34,7 +34,7 @@ GENERAL=Makefile README crypto-lib.com install.com
|
|||||||
LIB= $(TOP)/libcrypto.a
|
LIB= $(TOP)/libcrypto.a
|
||||||
SHARED_LIB= libcrypto$(SHLIB_EXT)
|
SHARED_LIB= libcrypto$(SHLIB_EXT)
|
||||||
LIBSRC= cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
|
LIBSRC= cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
|
||||||
LIBOBJ= cryptlib.o mem.o mem_clr.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
|
LIBOBJ= cryptlib.o mem.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
|
||||||
|
|
||||||
SRC= $(LIBSRC)
|
SRC= $(LIBSRC)
|
||||||
|
|
||||||
|
@ -1,11 +1,13 @@
|
|||||||
// Works on all IA-64 platforms: Linux, HP-UX, Win64i...
|
// Works on all IA-64 platforms: Linux, HP-UX, Win64i...
|
||||||
// On Win64i compile with ias.exe.
|
// On Win64i compile with ias.exe.
|
||||||
.text
|
.text
|
||||||
|
|
||||||
.global OPENSSL_cpuid_setup#
|
.global OPENSSL_cpuid_setup#
|
||||||
.proc OPENSSL_cpuid_setup#
|
.proc OPENSSL_cpuid_setup#
|
||||||
OPENSSL_cpuid_setup:
|
OPENSSL_cpuid_setup:
|
||||||
{ .mib; br.ret.sptk.many b0 };;
|
{ .mib; br.ret.sptk.many b0 };;
|
||||||
.endp OPENSSL_cpuid_setup#
|
.endp OPENSSL_cpuid_setup#
|
||||||
|
|
||||||
.global OPENSSL_rdtsc#
|
.global OPENSSL_rdtsc#
|
||||||
.proc OPENSSL_rdtsc#
|
.proc OPENSSL_rdtsc#
|
||||||
OPENSSL_rdtsc:
|
OPENSSL_rdtsc:
|
||||||
@ -124,3 +126,37 @@ OPENSSL_wipe_cpu:
|
|||||||
mov ar.lc=r3
|
mov ar.lc=r3
|
||||||
br.ret.sptk b0 };;
|
br.ret.sptk b0 };;
|
||||||
.endp OPENSSL_wipe_cpu#
|
.endp OPENSSL_wipe_cpu#
|
||||||
|
|
||||||
|
// OPENSSL_cleanse: overwrite a buffer with zero bytes.
//   r32 - buffer address on entry; post-incremented by every store
//   r33 - length in bytes on entry; counts down as bytes are cleared
//   r2  - scratch: address alignment bits, then the rounded length
// Lengths below 15 are cleared one byte at a time (.Little).  Longer
// buffers are byte-filled up to an 8-byte boundary (.Lot), cleared with
// 8-byte stores (.Laligned), and any tail goes back through .Little.
.global	OPENSSL_cleanse#
.proc	OPENSSL_cleanse#
OPENSSL_cleanse:
// .Little stores a byte before it looks at the counter, so an empty
// buffer has to be turned away here or one byte past it gets cleared.
{ .mib;	cmp.eq		p6,p0=0,r33		// len==0
(p6)	br.ret.spnt.many	b0	};;
{ .mib;	and		r2=7,r32
	cmp.leu		p6,p0=15,r33		// len>=15
(p6)	br.cond.dptk	.Lot		};;

// Byte loop: requires len>=1 on entry.
.Little:
{ .mib;	st1		[r32]=r0,1
	cmp.ltu		p6,p7=1,r33	}	// len>1
{ .mbb;	add		r33=-1,r33		// len--
(p6)	br.cond.dptk	.Little
(p7)	br.ret.sptk.many	b0	};;

// Alignment loop: clear single bytes until the address is 8-byte aligned.
.Lot:
{ .mib;	cmp.eq		p6,p0=0,r2
(p6)	br.cond.dptk	.Laligned	};;
{ .mmi;	st1		[r32]=r0,1;;
	and		r2=7,r32	}
{ .mib;	add		r33=-1,r33
	br		.Lot		};;

// Word loop: r2 is computed from the length before it is decremented.
.Laligned:
{ .mmi;	st8		[r32]=r0,8
	and		r2=-8,r33		// len&~7
	add		r33=-8,r33	};;	// len-=8
{ .mib;	cmp.ltu		p6,p0=8,r2		// ((len+8)&~7)>8
(p6)	br.cond.dptk	.Laligned	};;

// Fewer than 8 bytes remain: finish in the byte loop unless already done.
{ .mbb;	cmp.eq		p6,p7=r0,r33
(p7)	br.cond.dpnt	.Little
(p6)	br.ret.sptk.many	b0	};;
.endp	OPENSSL_cleanse#
|
||||||
|
10
crypto/mem.c
10
crypto/mem.c
@ -250,7 +250,6 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
|
|||||||
void *CRYPTO_malloc_locked(int num, const char *file, int line)
|
void *CRYPTO_malloc_locked(int num, const char *file, int line)
|
||||||
{
|
{
|
||||||
void *ret = NULL;
|
void *ret = NULL;
|
||||||
extern unsigned char cleanse_ctr;
|
|
||||||
|
|
||||||
if (num <= 0) return NULL;
|
if (num <= 0) return NULL;
|
||||||
|
|
||||||
@ -267,11 +266,15 @@ void *CRYPTO_malloc_locked(int num, const char *file, int line)
|
|||||||
if (malloc_debug_func != NULL)
|
if (malloc_debug_func != NULL)
|
||||||
malloc_debug_func(ret, num, file, line, 1);
|
malloc_debug_func(ret, num, file, line, 1);
|
||||||
|
|
||||||
|
#ifndef OPENSSL_CPUID_OBJ
|
||||||
/* Create a dependency on the value of 'cleanse_ctr' so our memory
|
/* Create a dependency on the value of 'cleanse_ctr' so our memory
|
||||||
* sanitisation function can't be optimised out. NB: We only do
|
* sanitisation function can't be optimised out. NB: We only do
|
||||||
* this for >2Kb so the overhead doesn't bother us. */
|
* this for >2Kb so the overhead doesn't bother us. */
|
||||||
if(ret && (num > 2048))
|
if(ret && (num > 2048))
|
||||||
|
{ extern unsigned char cleanse_ctr;
|
||||||
((unsigned char *)ret)[0] = cleanse_ctr;
|
((unsigned char *)ret)[0] = cleanse_ctr;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@ -291,7 +294,6 @@ void CRYPTO_free_locked(void *str)
|
|||||||
void *CRYPTO_malloc(int num, const char *file, int line)
|
void *CRYPTO_malloc(int num, const char *file, int line)
|
||||||
{
|
{
|
||||||
void *ret = NULL;
|
void *ret = NULL;
|
||||||
extern unsigned char cleanse_ctr;
|
|
||||||
|
|
||||||
if (num <= 0) return NULL;
|
if (num <= 0) return NULL;
|
||||||
|
|
||||||
@ -308,11 +310,15 @@ void *CRYPTO_malloc(int num, const char *file, int line)
|
|||||||
if (malloc_debug_func != NULL)
|
if (malloc_debug_func != NULL)
|
||||||
malloc_debug_func(ret, num, file, line, 1);
|
malloc_debug_func(ret, num, file, line, 1);
|
||||||
|
|
||||||
|
#ifndef OPENSSL_CPUID_OBJ
|
||||||
/* Create a dependency on the value of 'cleanse_ctr' so our memory
|
/* Create a dependency on the value of 'cleanse_ctr' so our memory
|
||||||
* sanitisation function can't be optimised out. NB: We only do
|
* sanitisation function can't be optimised out. NB: We only do
|
||||||
* this for >2Kb so the overhead doesn't bother us. */
|
* this for >2Kb so the overhead doesn't bother us. */
|
||||||
if(ret && (num > 2048))
|
if(ret && (num > 2048))
|
||||||
|
{ extern unsigned char cleanse_ctr;
|
||||||
((unsigned char *)ret)[0] = cleanse_ctr;
|
((unsigned char *)ret)[0] = cleanse_ctr;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -232,6 +232,54 @@ _sparcv9_rdtick:
|
|||||||
.type _sparcv9_rdtick,#function
|
.type _sparcv9_rdtick,#function
|
||||||
.size _sparcv9_rdtick,.-_sparcv9_rdtick
|
.size _sparcv9_rdtick,.-_sparcv9_rdtick
|
||||||
|
|
||||||
|
! OPENSSL_cleanse: overwrite a buffer with zero bytes.
!   %o0 - buffer address on entry, advanced as bytes are cleared
!   %o1 - length in bytes on entry, counts down as bytes are cleared
! Lengths of 6 or less are cleared one byte at a time (.Little).  Longer
! buffers are byte-filled up to a 4-byte boundary (.Lot), cleared with
! 4-byte stores (.Laligned), and any tail goes back through .Little.
.global	OPENSSL_cleanse
.align	32
OPENSSL_cleanse:
	cmp	%o1,6
	nop
#ifdef ABI64
	bgu	%xcc,.Lot
#else
	bgu	.Lot
#endif
	cmp	%o1,0		! delay slot: is the buffer empty?
	bne	.Little		! .Little stores before it tests the counter,
	nop			! so it must never be entered with len==0
	retl
	nop

! Byte loop: requires len>=1 on entry.
.Little:
	stb	%g0,[%o0]
	subcc	%o1,1,%o1
	bnz	.Little
	add	%o0,1,%o0	! delay slot: advance the pointer
	retl
	nop
.align	32
! Alignment loop: clear single bytes until the address is 4-byte aligned.
.Lot:
	andcc	%o0,3,%g0
	bz	.Laligned
	nop
	stb	%g0,[%o0]
	sub	%o1,1,%o1
	ba	.Lot
	add	%o0,1,%o0	! delay slot: advance the pointer
	nop
! Word loop: runs while at least 4 bytes remain after the store.
.Laligned:
	st	%g0,[%o0]
	sub	%o1,4,%o1
	andcc	%o1,-4,%g0
#ifdef ABI64
	bnz	%xcc,.Laligned
#else
	bnz	.Laligned
#endif
	add	%o0,4,%o0	! delay slot: advance the pointer

	cmp	%o1,0		! fewer than 4 bytes remain
	bne	.Little
	nop
	retl
	nop
.type	OPENSSL_cleanse,#function
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
|
||||||
|
|
||||||
.section ".init",#alloc,#execinstr
|
.section ".init",#alloc,#execinstr
|
||||||
call OPENSSL_cpuid_setup
|
call OPENSSL_cpuid_setup
|
||||||
nop
|
nop
|
||||||
|
@ -155,4 +155,36 @@ OPENSSL_ia32_cpuid:
|
|||||||
or %rcx,%rax
|
or %rcx,%rax
|
||||||
ret
|
ret
|
||||||
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
|
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
|
||||||
|
|
||||||
|
# OPENSSL_cleanse: overwrite a buffer with zero bytes.
#   rdi - buffer address on entry, advanced as bytes are cleared
#   rsi - length in bytes on entry, counts down as bytes are cleared
#   rax - zero, the value being stored
# Lengths below 15 are cleared one byte at a time (.Little).  Longer
# buffers are byte-filled up to an 8-byte boundary (.Lot), cleared with
# 8-byte stores (.Laligned), and any tail goes back through .Little.
.globl	OPENSSL_cleanse
.type	OPENSSL_cleanse,\@function,2
.align	16
OPENSSL_cleanse:
	xor	%rax,%rax
	cmp	\$15,%rsi
	jae	.Lot
	# .Little stores before it tests the counter, so a zero length
	# would wrap the counter and keep storing; return early instead.
	cmp	\$0,%rsi
	je	.Lret
.Little:
	mov	%al,(%rdi)
	sub	\$1,%rsi
	lea	1(%rdi),%rdi	# lea leaves the flags from sub intact for jnz
	jnz	.Little
.Lret:
	ret
.align	16
.Lot:
	test	\$7,%rdi
	jz	.Laligned
	mov	%al,(%rdi)
	lea	-1(%rsi),%rsi
	lea	1(%rdi),%rdi
	jmp	.Lot
.Laligned:
	mov	%rax,(%rdi)
	lea	-8(%rsi),%rsi
	test	\$-8,%rsi	# at least 8 bytes still to clear?
	lea	8(%rdi),%rdi	# lea leaves the flags from test intact for jnz
	jnz	.Laligned
	cmp	\$0,%rsi
	jne	.Little
	ret
.size	OPENSSL_cleanse,.-OPENSSL_cleanse
|
||||||
___
|
___
|
||||||
|
@ -216,6 +216,37 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
|
|||||||
}
|
}
|
||||||
&function_end_B("OPENSSL_indirect_call");
|
&function_end_B("OPENSSL_indirect_call");
|
||||||
|
|
||||||
|
# OPENSSL_cleanse: overwrite a buffer with zero bytes.
#   edx - buffer address (first stack argument), advanced as bytes are cleared
#   ecx - length in bytes (second stack argument), counts down to zero
#   eax - zero, the value being stored
# Lengths below 7 are cleared one byte at a time ("little").  Longer
# buffers are byte-filled up to a 4-byte boundary ("lot"), cleared with
# 4-byte stores ("aligned"), and any tail goes back through "little".
&function_begin_B("OPENSSL_cleanse");
	&mov	("edx",&wparam(0));
	&mov	("ecx",&wparam(1));
	&xor	("eax","eax");
	&cmp	("ecx",7);
	&jae	(&label("lot"));
	# "little" stores before it tests the counter, so a zero length
	# would wrap the counter and keep storing; return early instead.
	&cmp	("ecx",0);
	&je	(&label("ret"));
&set_label("little");
	&mov	(&BP(0,"edx"),"al");
	&sub	("ecx",1);
	&lea	("edx",&DWP(1,"edx"));	# lea keeps the flags from sub for jnz
	&jnz	(&label("little"));
&set_label("ret");
	&ret	();

&set_label("lot",16);
	&test	("edx",3);
	&jz	(&label("aligned"));
	&mov	(&BP(0,"edx"),"al");
	&lea	("ecx",&DWP(-1,"ecx"));
	&lea	("edx",&DWP(1,"edx"));
	&jmp	(&label("lot"));
&set_label("aligned");
	&mov	(&DWP(0,"edx"),"eax");
	&lea	("ecx",&DWP(-4,"ecx"));
	&test	("ecx",-4);		# at least 4 bytes still to clear?
	&lea	("edx",&DWP(4,"edx"));	# lea keeps the flags from test for jnz
	&jnz	(&label("aligned"));
	&cmp	("ecx",0);
	&jne	(&label("little"));
	&ret	();
&function_end_B("OPENSSL_cleanse");
|
||||||
|
|
||||||
&initseg("OPENSSL_cpuid_setup");
|
&initseg("OPENSSL_cpuid_setup");
|
||||||
|
|
||||||
&asm_finish();
|
&asm_finish();
|
||||||
|
Loading…
x
Reference in New Issue
Block a user