openssl/crypto/rc4/asm/rc4-amd64.pl
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from the config
# line results in a ~40% improvement (yes, even for the C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* a config
# line *without* RC4_CHAR! As for the coding "secret," I bet on partial
# register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
# I simply use 'inc %r8b'. Even though the optimization manual discourages
# operating on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...
# As was shown by Marc Bevand, reordering a couple of load operations
# results in an even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# The latter means that if you want to *estimate* what to expect from
# *your* CPU, multiply 54 by 3.3 and by the clock frequency in GHz.
# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 is to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code that would perform even within a 30% margin
# on both AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details... This applies to 0.9.8 and later.
# In the 0.9.7 context the RC4_CHAR codepath is never engaged and ~70
# bytes of code remain redundant.
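#
# For orientation: each round of the integer-array path below is one
# step of the standard RC4 PRGA. A rough sketch in Perl (illustrative
# only, variable names are placeholders, not used by this module):
#
#	$x = ($x+1)&0xff;	$tx = $d[$x];
#	$y = ($y+$tx)&0xff;	$ty = $d[$y];
#	$d[$y] = $tx;		$d[$x] = $ty;
#	$out_byte = $in_byte ^ $d[($tx+$ty)&0xff];
#
# Plugging numbers into the estimate above, a 2GHz Opteron would be
# expected to run at roughly 54*3.3*2 =~ 356MBps.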

$output=shift;

open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";

$dat="%rdi";	# arg1
$len="%rsi";	# arg2
$inp="%rdx";	# arg3
$out="%rcx";	# arg4
$XX="%r10";
$TX="%r8";
$YY="%r11";
$TY="%r9";
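
# In the code below, $XX#d, $XX#b etc. denote the 32-bit and 8-bit
# sub-registers of the variables above (e.g. %r10#d); the substitution
# at the end of this file rewrites them to real register names
# (%r10d, %r10b, ...) before the code reaches the assembler.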
$code=<<___;
.text

.globl	RC4
.type	RC4,\@function,4
.align	16
RC4:	or	$len,$len
	jne	.Lentry
	ret
.Lentry:
	add	\$8,$dat
	movl	-8($dat),$XX#d
	movl	-4($dat),$YY#d
	cmpl	\$-1,256($dat)
	je	.LRC4_CHAR
	test	\$-8,$len
	jz	.Lloop1
.align	16
.Lloop8:
	inc	$XX#b
	movl	($dat,$XX,4),$TX#d
	add	$TX#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX,4)
	add	$TX#b,$TY#b
	inc	$XX#b
	movl	($dat,$XX,4),$TX#d
	movb	($dat,$TY,4),%al
___
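
# Six more identical rounds; each one rotates the key-stream bytes
# gathered so far in %rax and deposits the next byte in %al.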
for ($i=1;$i<=6;$i++) {
$code.=<<___;
	add	$TX#b,$YY#b
	ror	\$8,%rax
	movl	($dat,$YY,4),$TY#d
	movl	$TX#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX,4)
	add	$TX#b,$TY#b
	inc	$XX#b
	movl	($dat,$XX,4),$TX#d
	movb	($dat,$TY,4),%al
___
}
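
# Final round: fetch the eighth key-stream byte, then xor all eight
# accumulated bytes against the input qword and store the result.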
$code.=<<___;
	add	$TX#b,$YY#b
	ror	\$8,%rax
	movl	($dat,$YY,4),$TY#d
	movl	$TX#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX,4)
	sub	\$8,$len
	add	$TY#b,$TX#b
	movb	($dat,$TX,4),%al
	ror	\$8,%rax
	add	\$8,$inp
	add	\$8,$out
	xor	-8($inp),%rax
	mov	%rax,-8($out)
	test	\$-8,$len
	jnz	.Lloop8
	cmp	\$0,$len
	jne	.Lloop1
.Lexit:
	movl	$XX#d,-8($dat)
	movl	$YY#d,-4($dat)
	ret
.align	16
.Lloop1:
	movzb	($inp),%eax
	inc	$XX#b
	movl	($dat,$XX,4),$TX#d
	add	$TX#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX,4)
	add	$TY#b,$TX#b
	movl	($dat,$TX,4),$TY#d
	xor	$TY,%rax
	inc	$inp
	movb	%al,($out)
	inc	$out
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit
.align	16
.LRC4_CHAR:
	add	\$1,$XX#b
	movzb	($dat,$XX),$TX#d
	add	$TX#b,$YY#b
	movzb	($dat,$YY),$TY#d
	movb	$TX#b,($dat,$YY)
	movb	$TY#b,($dat,$XX)
	add	$TX#b,$TY#b
	movzb	($dat,$TY),$TY#d
	xorb	($inp),$TY#b
	movb	$TY#b,($out)
	lea	1($inp),$inp
	lea	1($out),$out
	sub	\$1,$len
	jnz	.LRC4_CHAR
	jmp	.Lexit
.size	RC4,.-RC4
___
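
# Collapse the size-suffix markers (see note above) and send the
# generated code down the xlate pipe opened on STDOUT.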
$code =~ s/#([bwd])/$1/gm;
print $code;
close STDOUT;