RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's
apparently impossible to compose blended code which would perform satisfactorily on all x86 and x86_64 cores, an extra RC4_CHAR code-path is introduced and the P4 core is detected at run-time. This way we keep the original performance on non-P4 implementations and turbo-charge P4 performance by a factor of 2.8x (on the 32-bit core).
This commit is contained in:
parent
00dd8f6d6e
commit
376729e130
@ -318,7 +318,7 @@ my %table=(
|
|||||||
"linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-s390x", "gcc:-DB_ENDIAN -DTERMIO -DNO_ASM -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK RC4_CHAR::asm/ia64.o::asm/aes-ia64.o:::asm/sha256-ia64.o asm/sha512-ia64.o:::::dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL::asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall -DMD32_REG_T=int::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL:amd64cpuid.o:asm/x86_64-gcc.o:::::::asm/rc4-amd64.o:::dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
"linux-elf-arm","gcc:-DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
|
||||||
"linux-parisc", "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::",
|
"linux-parisc", "gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT:::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::",
|
||||||
#### SPARC Linux setups
|
#### SPARC Linux setups
|
||||||
|
4
TABLE
4
TABLE
@ -2086,7 +2086,7 @@ $unistd =
|
|||||||
$thread_cflag = -D_REENTRANT
|
$thread_cflag = -D_REENTRANT
|
||||||
$sys_id =
|
$sys_id =
|
||||||
$lflags = -rdynamic -ldl
|
$lflags = -rdynamic -ldl
|
||||||
$bn_ops = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
|
$bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT
|
||||||
$cpuid_obj = x86cpuid-elf.o
|
$cpuid_obj = x86cpuid-elf.o
|
||||||
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
|
$bn_obj = asm/bn86-elf.o asm/co86-elf.o
|
||||||
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
|
$des_obj = asm/dx86-elf.o asm/yx86-elf.o
|
||||||
@ -3572,7 +3572,7 @@ $thread_cflag = -D_REENTRANT
|
|||||||
$sys_id =
|
$sys_id =
|
||||||
$lflags = -ldl
|
$lflags = -ldl
|
||||||
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL
|
$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK BF_PTR2 DES_INT DES_UNROLL
|
||||||
$cpuid_obj =
|
$cpuid_obj = amd64cpuid.o
|
||||||
$bn_obj = asm/x86_64-gcc.o
|
$bn_obj = asm/x86_64-gcc.o
|
||||||
$des_obj =
|
$des_obj =
|
||||||
$aes_obj =
|
$aes_obj =
|
||||||
|
@ -5,16 +5,63 @@ $win64a=1 if ($output =~ /win64a\.[s|asm]/);
|
|||||||
open STDOUT,">$output" || die "can't open $output: $!";
|
open STDOUT,">$output" || die "can't open $output: $!";
|
||||||
|
|
||||||
print<<___ if(defined($win64a));
|
print<<___ if(defined($win64a));
|
||||||
TEXT SEGMENT
|
_TEXT SEGMENT
|
||||||
PUBLIC OPENSSL_rdtsc
|
PUBLIC OPENSSL_rdtsc
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
OPENSSL_rdtsc PROC NEAR
|
OPENSSL_rdtsc PROC
|
||||||
rdtsc
|
rdtsc
|
||||||
shl rdx,32
|
shl rdx,32
|
||||||
or rax,rdx
|
or rax,rdx
|
||||||
ret
|
ret
|
||||||
OPENSSL_rdtsc ENDP
|
OPENSSL_rdtsc ENDP
|
||||||
TEXT ENDS
|
|
||||||
|
PUBLIC OPENSSL_atomic_add
|
||||||
|
ALIGN 16
|
||||||
|
OPENSSL_atomic_add PROC
|
||||||
|
mov eax,DWORD PTR[rcx]
|
||||||
|
\$Lspin: lea r8,DWORD PTR[rdx+rax]
|
||||||
|
lock cmpxchg DWORD PTR[rcx],r8d
|
||||||
|
jne \$Lspin
|
||||||
|
mov eax,r8d
|
||||||
|
cdqe
|
||||||
|
ret
|
||||||
|
OPENSSL_atomic_add ENDP
|
||||||
|
|
||||||
|
PUBLIC OPENSSL_wipe_cpu
|
||||||
|
ALIGN 16
|
||||||
|
OPENSSL_wipe_cpu PROC
|
||||||
|
pxor xmm0,xmm0
|
||||||
|
pxor xmm1,xmm1
|
||||||
|
pxor xmm2,xmm2
|
||||||
|
pxor xmm3,xmm3
|
||||||
|
pxor xmm4,xmm4
|
||||||
|
pxor xmm5,xmm5
|
||||||
|
xor rcx,rcx
|
||||||
|
xor rdx,rdx
|
||||||
|
xor r8,r8
|
||||||
|
xor r9,r9
|
||||||
|
xor r10,r10
|
||||||
|
xor r11,r11
|
||||||
|
lea rax,QWORD PTR[rsp+8]
|
||||||
|
ret
|
||||||
|
OPENSSL_wipe_cpu ENDP
|
||||||
|
|
||||||
|
OPENSSL_ia32_cpuid PROC
|
||||||
|
mov r8,rbx
|
||||||
|
mov eax,1
|
||||||
|
cpuid
|
||||||
|
shl rcx,32
|
||||||
|
mov eax,edx
|
||||||
|
mov rbx,r8
|
||||||
|
or rax,rcx
|
||||||
|
ret
|
||||||
|
OPENSSL_ia32_cpuid ENDP
|
||||||
|
_TEXT ENDS
|
||||||
|
|
||||||
|
CRT\$XIU SEGMENT
|
||||||
|
EXTRN OPENSSL_cpuid_setup:PROC
|
||||||
|
DQ OPENSSL_cpuid_setup
|
||||||
|
CRT\$XIU ENDS
|
||||||
END
|
END
|
||||||
___
|
___
|
||||||
print<<___ if(!defined($win64a));
|
print<<___ if(!defined($win64a));
|
||||||
@ -27,4 +74,66 @@ OPENSSL_rdtsc:
|
|||||||
or %rdx,%rax
|
or %rdx,%rax
|
||||||
ret
|
ret
|
||||||
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
|
.size OPENSSL_rdtsc,.-OPENSSL_rdtsc
|
||||||
|
|
||||||
|
.globl OPENSSL_atomic_add
|
||||||
|
.type OPENSSL_atomic_add,\@function
|
||||||
|
.align 16
|
||||||
|
OPENSSL_atomic_add:
|
||||||
|
movl (%rdi),%eax
|
||||||
|
.Lspin: lea (%rsi,%rax),%r8
|
||||||
|
lock; cmpxchg %r8d,(%rdi)
|
||||||
|
jne .Lspin
|
||||||
|
mov %r8d,%eax
|
||||||
|
cdqe
|
||||||
|
ret
|
||||||
|
.size OPENSSL_atomic_add,.-OPENSSL_atomic_add
|
||||||
|
|
||||||
|
.globl OPENSSL_wipe_cpu
|
||||||
|
.type OPENSSL_wipe_cpu,\@function
|
||||||
|
.align 16
|
||||||
|
OPENSSL_wipe_cpu:
|
||||||
|
pxor %xmm0,%xmm0
|
||||||
|
pxor %xmm1,%xmm1
|
||||||
|
pxor %xmm2,%xmm2
|
||||||
|
pxor %xmm3,%xmm3
|
||||||
|
pxor %xmm4,%xmm4
|
||||||
|
pxor %xmm5,%xmm5
|
||||||
|
pxor %xmm6,%xmm6
|
||||||
|
pxor %xmm7,%xmm7
|
||||||
|
pxor %xmm8,%xmm8
|
||||||
|
pxor %xmm9,%xmm9
|
||||||
|
pxor %xmm10,%xmm10
|
||||||
|
pxor %xmm11,%xmm11
|
||||||
|
pxor %xmm12,%xmm12
|
||||||
|
pxor %xmm13,%xmm13
|
||||||
|
pxor %xmm14,%xmm14
|
||||||
|
pxor %xmm15,%xmm15
|
||||||
|
xor %rcx,%rcx
|
||||||
|
xor %rdx,%rdx
|
||||||
|
xor %rsi,%rsi
|
||||||
|
xor %rdi,%rdi
|
||||||
|
xor %r8,%r8
|
||||||
|
xor %r9,%r9
|
||||||
|
xor %r10,%r10
|
||||||
|
xor %r11,%r11
|
||||||
|
lea 8(%rsp),%rax
|
||||||
|
ret
|
||||||
|
.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
|
||||||
|
|
||||||
|
.globl OPENSSL_ia32_cpuid
|
||||||
|
.align 16
|
||||||
|
OPENSSL_ia32_cpuid:
|
||||||
|
mov %rbx,%r8
|
||||||
|
mov \$1,%eax
|
||||||
|
cpuid
|
||||||
|
shl \$32,%rcx
|
||||||
|
mov %edx,%eax
|
||||||
|
mov %r8,%rbx
|
||||||
|
or %rcx,%rax
|
||||||
|
ret
|
||||||
|
.size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
|
||||||
|
|
||||||
|
.section .init
|
||||||
|
call OPENSSL_cpuid_setup
|
||||||
|
.align 16
|
||||||
___
|
___
|
||||||
|
@ -161,7 +161,7 @@ sub main'shl { &out2("sall",@_); }
|
|||||||
sub main'shr { &out2("shrl",@_); }
|
sub main'shr { &out2("shrl",@_); }
|
||||||
sub main'xor { &out2("xorl",@_); }
|
sub main'xor { &out2("xorl",@_); }
|
||||||
sub main'xorb { &out2("xorb",@_); }
|
sub main'xorb { &out2("xorb",@_); }
|
||||||
sub main'add { &out2("addl",@_); }
|
sub main'add { &out2($_[0]=~/%[a-d][lh]/?"addb":"addl",@_); }
|
||||||
sub main'adc { &out2("adcl",@_); }
|
sub main'adc { &out2("adcl",@_); }
|
||||||
sub main'sub { &out2("subl",@_); }
|
sub main'sub { &out2("subl",@_); }
|
||||||
sub main'sbb { &out2("sbbl",@_); }
|
sub main'sbb { &out2("sbbl",@_); }
|
||||||
@ -189,7 +189,7 @@ sub main'jc { &out1("jc",@_); }
|
|||||||
sub main'jnc { &out1("jnc",@_); }
|
sub main'jnc { &out1("jnc",@_); }
|
||||||
sub main'jno { &out1("jno",@_); }
|
sub main'jno { &out1("jno",@_); }
|
||||||
sub main'dec { &out1("decl",@_); }
|
sub main'dec { &out1("decl",@_); }
|
||||||
sub main'inc { &out1("incl",@_); }
|
sub main'inc { &out1($_[0]=~/%[a-d][hl]/?"incb":"incl",@_); }
|
||||||
sub main'push { &out1("pushl",@_); $stack+=4; }
|
sub main'push { &out1("pushl",@_); $stack+=4; }
|
||||||
sub main'pop { &out1("popl",@_); $stack-=4; }
|
sub main'pop { &out1("popl",@_); $stack-=4; }
|
||||||
sub main'pushf { &out0("pushfl"); $stack+=4; }
|
sub main'pushf { &out0("pushfl"); $stack+=4; }
|
||||||
@ -205,9 +205,10 @@ sub main'nop { &out0("nop"); }
|
|||||||
sub main'test { &out2("testl",@_); }
|
sub main'test { &out2("testl",@_); }
|
||||||
sub main'bt { &out2("btl",@_); }
|
sub main'bt { &out2("btl",@_); }
|
||||||
sub main'leave { &out0("leave"); }
|
sub main'leave { &out0("leave"); }
|
||||||
sub main'cpuid { &out0(".byte 0x0f; .byte 0xa2"); }
|
sub main'cpuid { &out0(".byte\t0x0f,0xa2"); }
|
||||||
sub main'rdtsc { &out0(".byte 0x0f; .byte 0x31"); }
|
sub main'rdtsc { &out0(".byte\t0x0f,0x31"); }
|
||||||
sub main'halt { &out0("hlt"); }
|
sub main'halt { &out0("hlt"); }
|
||||||
|
sub main'movz { &out2("movzb",@_); }
|
||||||
|
|
||||||
# SSE2
|
# SSE2
|
||||||
sub main'emms { &out0("emms"); }
|
sub main'emms { &out0("emms"); }
|
||||||
@ -558,7 +559,7 @@ sub main'file_end
|
|||||||
pushl %ebx
|
pushl %ebx
|
||||||
movl %edx,%edi
|
movl %edx,%edi
|
||||||
movl \$1,%eax
|
movl \$1,%eax
|
||||||
.byte 0x0f; .byte 0xa2
|
.byte 0x0f,0xa2
|
||||||
orl \$1<<10,%edx
|
orl \$1<<10,%edx
|
||||||
movl %edx,0(%edi)
|
movl %edx,0(%edi)
|
||||||
popl %ebx
|
popl %ebx
|
||||||
|
@ -7,10 +7,10 @@ require "x86asm.pl";
|
|||||||
|
|
||||||
&asm_init($ARGV[0],"rc4-586.pl");
|
&asm_init($ARGV[0],"rc4-586.pl");
|
||||||
|
|
||||||
$tx="eax";
|
$x="eax";
|
||||||
$ty="ebx";
|
$y="ebx";
|
||||||
$x="ecx";
|
$tx="ecx";
|
||||||
$y="edx";
|
$ty="edx";
|
||||||
$in="esi";
|
$in="esi";
|
||||||
$out="edi";
|
$out="edi";
|
||||||
$d="ebp";
|
$d="ebp";
|
||||||
@ -31,7 +31,7 @@ sub RC4_loop
|
|||||||
{
|
{
|
||||||
&mov($ty, &swtmp(2));
|
&mov($ty, &swtmp(2));
|
||||||
&cmp($ty, $in);
|
&cmp($ty, $in);
|
||||||
&jle(&label("finished"));
|
&jbe(&label("finished"));
|
||||||
&inc($in);
|
&inc($in);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@ -39,7 +39,7 @@ sub RC4_loop
|
|||||||
&add($ty, 8);
|
&add($ty, 8);
|
||||||
&inc($in);
|
&inc($in);
|
||||||
&cmp($ty, $in);
|
&cmp($ty, $in);
|
||||||
&jl(&label("finished"));
|
&jb(&label("finished"));
|
||||||
&mov(&swtmp(2), $ty);
|
&mov(&swtmp(2), $ty);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -88,35 +88,44 @@ sub RC4
|
|||||||
|
|
||||||
&function_begin_B($name,"");
|
&function_begin_B($name,"");
|
||||||
|
|
||||||
|
&mov($ty,&wparam(1)); # len
|
||||||
|
&cmp($ty,0);
|
||||||
|
&jne(&label("proceed"));
|
||||||
|
&ret();
|
||||||
|
&set_label("proceed");
|
||||||
|
|
||||||
&comment("");
|
&comment("");
|
||||||
|
|
||||||
&push("ebp");
|
&push("ebp");
|
||||||
&push("ebx");
|
&push("ebx");
|
||||||
&mov( $d, &wparam(0)); # key
|
|
||||||
&mov( $ty, &wparam(1)); # num
|
|
||||||
&push("esi");
|
&push("esi");
|
||||||
&push("edi");
|
&push("edi");
|
||||||
|
&mov( $d, &wparam(0)); # key
|
||||||
|
&mov( $in, &wparam(2));
|
||||||
|
|
||||||
&mov( $x, &DWP(0,$d,"",1));
|
&mov( $x, &DWP(0,$d,"",1));
|
||||||
&mov( $y, &DWP(4,$d,"",1));
|
&mov( $y, &DWP(4,$d,"",1));
|
||||||
|
|
||||||
&mov( $in, &wparam(2));
|
&mov( $out, &wparam(3));
|
||||||
&inc( $x);
|
&inc( $x);
|
||||||
|
|
||||||
&stack_push(3); # 3 temp variables
|
&stack_push(3); # 3 temp variables
|
||||||
&add( $d, 8);
|
&add( $d, 8);
|
||||||
&and( $x, 0xff);
|
&and( $x, 0xff);
|
||||||
|
|
||||||
|
# detect compressed schedule, see commentary section in rc4_skey.c...
|
||||||
|
&cmp(&DWP(256,$d),-1);
|
||||||
|
&je(&label("RC4_CHAR"));
|
||||||
|
|
||||||
&lea( $ty, &DWP(-8,$ty,$in));
|
&lea( $ty, &DWP(-8,$ty,$in));
|
||||||
|
|
||||||
# check for 0 length input
|
# check for 0 length input
|
||||||
|
|
||||||
&mov( $out, &wparam(3));
|
|
||||||
&mov( &swtmp(2), $ty); # this is now address to exit at
|
&mov( &swtmp(2), $ty); # this is now address to exit at
|
||||||
&mov( $tx, &DWP(0,$d,$x,4));
|
&mov( $tx, &DWP(0,$d,$x,4));
|
||||||
|
|
||||||
&cmp( $ty, $in);
|
&cmp( $ty, $in);
|
||||||
&jl( &label("end")); # less than 8 bytes
|
&jb( &label("end")); # less than 8 bytes
|
||||||
|
|
||||||
&set_label("start");
|
&set_label("start");
|
||||||
|
|
||||||
@ -148,7 +157,7 @@ sub RC4
|
|||||||
&mov( &DWP(-4,$out,"",0), $tx);
|
&mov( &DWP(-4,$out,"",0), $tx);
|
||||||
&mov( $tx, &DWP(0,$d,$x,4));
|
&mov( $tx, &DWP(0,$d,$x,4));
|
||||||
&cmp($in, $ty);
|
&cmp($in, $ty);
|
||||||
&jle(&label("start"));
|
&jbe(&label("start"));
|
||||||
|
|
||||||
&set_label("end");
|
&set_label("end");
|
||||||
|
|
||||||
@ -162,6 +171,32 @@ sub RC4
|
|||||||
&RC4_loop(5,0,1);
|
&RC4_loop(5,0,1);
|
||||||
&RC4_loop(6,1,1);
|
&RC4_loop(6,1,1);
|
||||||
|
|
||||||
|
&jmp(&label("finished"));
|
||||||
|
|
||||||
|
&align(16);
|
||||||
|
# this is essentially Intel P4 specific codepath, see rc4_skey.c...
|
||||||
|
&set_label("RC4_CHAR");
|
||||||
|
|
||||||
|
&lea ($ty,&DWP(0,$in,$ty));
|
||||||
|
&mov (&swtmp(2),$ty);
|
||||||
|
|
||||||
|
# strangely enough unrolled loop performs over 20% slower...
|
||||||
|
&set_label("RC4_CHAR_loop");
|
||||||
|
&movz ($tx,&BP(0,$d,$x));
|
||||||
|
&add (&LB($y),&LB($tx));
|
||||||
|
&movz ($ty,&BP(0,$d,$y));
|
||||||
|
&movb (&BP(0,$d,$y),&LB($tx));
|
||||||
|
&movb (&BP(0,$d,$x),&LB($ty));
|
||||||
|
&add (&LB($ty),&LB($tx));
|
||||||
|
&movz ($ty,&BP(0,$d,$ty));
|
||||||
|
&xorb (&LB($ty),&BP(0,$in));
|
||||||
|
&movb (&BP(0,$out),&LB($ty));
|
||||||
|
&inc (&LB($x));
|
||||||
|
&inc ($in);
|
||||||
|
&inc ($out);
|
||||||
|
&cmp ($in,&swtmp(2));
|
||||||
|
&jb (&label("RC4_CHAR_loop"));
|
||||||
|
|
||||||
&set_label("finished");
|
&set_label("finished");
|
||||||
&dec( $x);
|
&dec( $x);
|
||||||
&stack_pop(3);
|
&stack_pop(3);
|
||||||
|
@ -25,6 +25,13 @@
|
|||||||
# Latter means that if you want to *estimate* what to expect from
|
# Latter means that if you want to *estimate* what to expect from
|
||||||
# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
|
# *your* CPU, then multiply 54 by 3.3 and clock frequency in GHz.
|
||||||
|
|
||||||
|
# Intel P4 EM64T core was found to run the AMD64 code really slow...
|
||||||
|
# The only way to achieve comparable performance on P4 is to keep
|
||||||
|
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
|
||||||
|
# compose blended code, which would perform even within a 30% margin
|
||||||
|
# on both AMD and Intel platforms, I implement both cases. See
|
||||||
|
# rc4_skey.c for further details...
|
||||||
|
|
||||||
$output=shift;
|
$output=shift;
|
||||||
|
|
||||||
$win64a=1 if ($output =~ /win64a.[s|asm]/);
|
$win64a=1 if ($output =~ /win64a.[s|asm]/);
|
||||||
@ -90,6 +97,8 @@ $code.=<<___;
|
|||||||
add \$8,$dat
|
add \$8,$dat
|
||||||
movl `&PTR("DWORD:-8[$dat]")`,$XX#d
|
movl `&PTR("DWORD:-8[$dat]")`,$XX#d
|
||||||
movl `&PTR("DWORD:-4[$dat]")`,$YY#d
|
movl `&PTR("DWORD:-4[$dat]")`,$YY#d
|
||||||
|
cmpl \$-1,`&PTR("DWORD:256[$dat]")`
|
||||||
|
je .LRC4_CHAR
|
||||||
test \$-8,$len
|
test \$-8,$len
|
||||||
jz .Lloop1
|
jz .Lloop1
|
||||||
.align 16
|
.align 16
|
||||||
@ -167,6 +176,24 @@ $code.=<<___;
|
|||||||
dec $len
|
dec $len
|
||||||
jnz .Lloop1
|
jnz .Lloop1
|
||||||
jmp .Lexit
|
jmp .Lexit
|
||||||
|
|
||||||
|
.align 16
|
||||||
|
.LRC4_CHAR:
|
||||||
|
inc $XX#b
|
||||||
|
movzb `&PTR("BYTE:[$dat+$XX]")`,$TX#d
|
||||||
|
add $TX#b,$YY#b
|
||||||
|
movzb `&PTR("BYTE:[$dat+$YY]")`,$TY#d
|
||||||
|
movb $TX#b,`&PTR("BYTE:[$dat+$YY]")`
|
||||||
|
movb $TY#b,`&PTR("BYTE:[$dat+$XX]")`
|
||||||
|
add $TX#b,$TY#b
|
||||||
|
movzb `&PTR("BYTE:[$dat+$TY]")`,$TY#d
|
||||||
|
xorb `&PTR("BYTE:[$inp]")`,$TY#b
|
||||||
|
movb $TY#b,`&PTR("BYTE:[$out]")`
|
||||||
|
inc $inp
|
||||||
|
inc $out
|
||||||
|
dec $len
|
||||||
|
jnz .LRC4_CHAR
|
||||||
|
jmp .Lexit
|
||||||
___
|
___
|
||||||
$code.=<<___ if (defined($win64a));
|
$code.=<<___ if (defined($win64a));
|
||||||
RC4 ENDP
|
RC4 ENDP
|
||||||
@ -189,6 +216,8 @@ if (defined($win64a)) {
|
|||||||
$code =~ s/mov[bwlq]/mov/gm;
|
$code =~ s/mov[bwlq]/mov/gm;
|
||||||
$code =~ s/movzb/movzx/gm;
|
$code =~ s/movzb/movzx/gm;
|
||||||
$code =~ s/repret/DB\t0F3h,0C3h/gm;
|
$code =~ s/repret/DB\t0F3h,0C3h/gm;
|
||||||
|
$code =~ s/cmpl/cmp/gm;
|
||||||
|
$code =~ s/xorb/xor/gm;
|
||||||
} else {
|
} else {
|
||||||
$code =~ s/([QD]*WORD|BYTE)://gm;
|
$code =~ s/([QD]*WORD|BYTE)://gm;
|
||||||
$code =~ s/repret/.byte\t0xF3,0xC3/gm;
|
$code =~ s/repret/.byte\t0xF3,0xC3/gm;
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#ifndef HEADER_RC4_LOCL_H
|
#ifndef HEADER_RC4_LOCL_H
|
||||||
#define HEADER_RC4_LOCL_H
|
#define HEADER_RC4_LOCL_H
|
||||||
#include <openssl/opensslconf.h>
|
#include <openssl/opensslconf.h>
|
||||||
|
#include <cryptlib.h>
|
||||||
#endif
|
#endif
|
||||||
|
@ -93,25 +93,58 @@ void RC4_set_key(RC4_KEY *key, int len, const unsigned char *data)
|
|||||||
unsigned int i;
|
unsigned int i;
|
||||||
|
|
||||||
d= &(key->data[0]);
|
d= &(key->data[0]);
|
||||||
for (i=0; i<256; i++)
|
|
||||||
d[i]=i;
|
|
||||||
key->x = 0;
|
key->x = 0;
|
||||||
key->y = 0;
|
key->y = 0;
|
||||||
id1=id2=0;
|
id1=id2=0;
|
||||||
|
|
||||||
#define SK_LOOP(n) { \
|
#define SK_LOOP(d,n) { \
|
||||||
tmp=d[(n)]; \
|
tmp=d[(n)]; \
|
||||||
id2 = (data[id1] + tmp + id2) & 0xff; \
|
id2 = (data[id1] + tmp + id2) & 0xff; \
|
||||||
if (++id1 == len) id1=0; \
|
if (++id1 == len) id1=0; \
|
||||||
d[(n)]=d[id2]; \
|
d[(n)]=d[id2]; \
|
||||||
d[id2]=tmp; }
|
d[id2]=tmp; }
|
||||||
|
|
||||||
|
#if defined(OPENSSL_CPUID_OBJ) && !defined(OPENSSL_NO_ASM)
|
||||||
|
# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
|
||||||
|
defined(__INTEL__) || \
|
||||||
|
defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64)
|
||||||
|
if (sizeof(RC4_INT) > 1) {
|
||||||
|
/*
|
||||||
|
* Unlike all other x86 [and x86_64] implementations,
|
||||||
|
* Intel P4 core [including EM64T] was found to perform
|
||||||
|
* poorly with wider RC4_INT. Performance improvement
|
||||||
|
* for IA-32 hand-coded assembler turned out to be 2.8x
|
||||||
|
* if re-coded for RC4_CHAR! It's however inappropriate
|
||||||
|
* to just switch to RC4_CHAR for x86[_64], as non-P4
|
||||||
|
* implementations suffer from significant performance
|
||||||
|
* losses then, e.g. PIII exhibits >2x deterioration,
|
||||||
|
* and so does Opteron. In order to assure optimal
|
||||||
|
* all-round performance, let us [try to] detect P4 at
|
||||||
|
* run-time by checking upon HTT bit in CPU capability
|
||||||
|
* vector and set up compressed key schedule, which is
|
||||||
|
* recognized by correspondingly updated assembler
|
||||||
|
* module...
|
||||||
|
* <appro@fy.chalmers.se>
|
||||||
|
*/
|
||||||
|
if (OPENSSL_ia32cap_P & (1<<28)) {
|
||||||
|
unsigned char *cp=(unsigned char *)d;
|
||||||
|
|
||||||
|
for (i=0;i<256;i++) cp[i]=i;
|
||||||
|
for (i=0;i<256;i++) SK_LOOP(cp,i);
|
||||||
|
/* mark schedule as compressed! */
|
||||||
|
d[256/sizeof(RC4_INT)]=-1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
# endif
|
||||||
|
#endif
|
||||||
|
for (i=0; i < 256; i++) d[i]=i;
|
||||||
for (i=0; i < 256; i+=4)
|
for (i=0; i < 256; i+=4)
|
||||||
{
|
{
|
||||||
SK_LOOP(i+0);
|
SK_LOOP(d,i+0);
|
||||||
SK_LOOP(i+1);
|
SK_LOOP(d,i+1);
|
||||||
SK_LOOP(i+2);
|
SK_LOOP(d,i+2);
|
||||||
SK_LOOP(i+3);
|
SK_LOOP(d,i+3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -14,11 +14,12 @@ OPENSSL_ia32cap
|
|||||||
Value returned by OPENSSL_ia32cap_loc() is address of a variable
|
Value returned by OPENSSL_ia32cap_loc() is address of a variable
|
||||||
containing IA-32 processor capabilities bit vector as it appears in EDX
|
containing IA-32 processor capabilities bit vector as it appears in EDX
|
||||||
register after executing CPUID instruction with EAX=1 input value (see
|
register after executing CPUID instruction with EAX=1 input value (see
|
||||||
Intel Application Note #241618). Naturally it's meaningful on IA-32
|
Intel Application Note #241618). Naturally it's meaningful on IA-32[E]
|
||||||
platforms only. The variable is normally set up automatically upon
|
platforms only. The variable is normally set up automatically upon
|
||||||
toolkit initialization, but can be manipulated afterwards to modify
|
toolkit initialization, but can be manipulated afterwards to modify
|
||||||
crypto library behaviour. For the moment of this writing only two bits
|
crypto library behaviour. At the time of this writing three bits are
|
||||||
are significant, namely bit #26 denoting SSE2 support, and bit #4
|
significant, namely bit #28 denoting Hyperthreading, which is used to
|
||||||
|
distinguish Intel P4 core, bit #26 denoting SSE2 support, and bit #4
|
||||||
denoting presence of Time-Stamp Counter. Clearing bit #26 at run-time
|
denoting presence of Time-Stamp Counter. Clearing bit #26 at run-time
|
||||||
for example disables high-performance SSE2 code present in the crypto
|
for example disables high-performance SSE2 code present in the crypto
|
||||||
library. You might have to do this if target OpenSSL application is
|
library. You might have to do this if target OpenSSL application is
|
||||||
|
Loading…
x
Reference in New Issue
Block a user