openssl/crypto/rc4/asm/rc4-amd64.pl
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# 2.22x RC4 tune-up:-) It should be noted though that my hand [as in
# "hand-coded assembler"] doesn't stand for the whole improvement
# coefficient. It turned out that eliminating RC4_CHAR from the config
# line results in a ~40% improvement (yes, even for the C implementation).
# Presumably it has everything to do with AMD cache architecture and
# RAW or whatever penalties. Once again! The module *requires* a config
# line *without* RC4_CHAR! As for the coding "secret," I bet on partial
# register arithmetic. For example, instead of 'inc %r8; and $255,%r8'
# I simply use 'inc %r8b'. Even though the optimization manual discourages
# operating on partial registers, it turned out to be the best bet.
# At least for AMD... How IA32E would perform remains to be seen...
# As was shown by Marc Bevand, reordering a couple of load operations
# results in an even higher performance gain of 3.3x:-) At least on
# Opteron... For reference, 1x in this case is RC4_CHAR C-code
# compiled with gcc 3.3.2, which performs at ~54MBps per 1GHz clock.
# The latter means that if you want to *estimate* what to expect from
# *your* CPU, multiply 54 by 3.3 and by the clock frequency in GHz.
# Intel P4 EM64T core was found to run the AMD64 code really slow...
# The only way to achieve comparable performance on P4 is to keep
# RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to
# compose blended code that would perform even within a 30% margin
# on both AMD and Intel platforms, I implement both cases. See
# rc4_skey.c for further details... This applies to 0.9.8 and later.
# In the 0.9.7 context the RC4_CHAR codepath is never engaged and ~70
# bytes of code remain redundant.
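#
# For orientation: each round of the integer-array path below is one
# step of the standard RC4 PRGA. A rough sketch in Perl (illustrative
# only, variable names are placeholders, not used by this module):
#
#	$x = ($x+1)&0xff;	$tx = $d[$x];
#	$y = ($y+$tx)&0xff;	$ty = $d[$y];
#	$d[$y] = $tx;		$d[$x] = $ty;
#	$out_byte = $in_byte ^ $d[($tx+$ty)&0xff];
#
# Plugging numbers into the estimate above, a 2GHz Opteron would be
# expected to run at roughly 54*3.3*2 =~ 356MBps.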

$output=shift;

open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";

$dat="%rdi";	# arg1
$len="%rsi";	# arg2
$inp="%rdx";	# arg3
$out="%rcx";	# arg4
$XX="%r10";
$TX="%r8";
$YY="%r11";
$TY="%r9";
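
# In the code below, $XX#d, $XX#b etc. denote the 32-bit and 8-bit
# sub-registers of the variables above (e.g. %r10#d); the substitution
# at the end of this file rewrites them to real register names
# (%r10d, %r10b, ...) before the code reaches the assembler.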
$code=<<___;
.text

.globl	RC4
.type	RC4,\@function,4
.align	16
RC4:	or	$len,$len
	jne	.Lentry
	ret
.Lentry:
	add	\$8,$dat
	movl	-8($dat),$XX#d
	movl	-4($dat),$YY#d
	cmpl	\$-1,256($dat)
	je	.LRC4_CHAR
	test	\$-8,$len
	jz	.Lloop1
.align	16
.Lloop8:
	inc	$XX#b
	movl	($dat,$XX,4),$TX#d
	add	$TX#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX,4)
	add	$TX#b,$TY#b
	inc	$XX#b
	movl	($dat,$XX,4),$TX#d
	movb	($dat,$TY,4),%al
___
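
# Six more identical rounds; each one rotates the key-stream bytes
# gathered so far in %rax and deposits the next byte in %al.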
for ($i=1;$i<=6;$i++) {
$code.=<<___;
	add	$TX#b,$YY#b
	ror	\$8,%rax
	movl	($dat,$YY,4),$TY#d
	movl	$TX#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX,4)
	add	$TX#b,$TY#b
	inc	$XX#b
	movl	($dat,$XX,4),$TX#d
	movb	($dat,$TY,4),%al
___
}
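
# Final round: fetch the eighth key-stream byte, then xor all eight
# accumulated bytes against the input qword and store the result.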
$code.=<<___;
	add	$TX#b,$YY#b
	ror	\$8,%rax
	movl	($dat,$YY,4),$TY#d
	movl	$TX#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX,4)
	sub	\$8,$len
	add	$TY#b,$TX#b
	movb	($dat,$TX,4),%al
	ror	\$8,%rax
	add	\$8,$inp
	add	\$8,$out
	xor	-8($inp),%rax
	mov	%rax,-8($out)
	test	\$-8,$len
	jnz	.Lloop8
	cmp	\$0,$len
	jne	.Lloop1
.Lexit:
	movl	$XX#d,-8($dat)
	movl	$YY#d,-4($dat)
	ret
.align	16
.Lloop1:
	movzb	($inp),%eax
	inc	$XX#b
	movl	($dat,$XX,4),$TX#d
	add	$TX#b,$YY#b
	movl	($dat,$YY,4),$TY#d
	movl	$TX#d,($dat,$YY,4)
	movl	$TY#d,($dat,$XX,4)
	add	$TY#b,$TX#b
	movl	($dat,$TX,4),$TY#d
	xor	$TY,%rax
	inc	$inp
	movb	%al,($out)
	inc	$out
	dec	$len
	jnz	.Lloop1
	jmp	.Lexit
.align	16
.LRC4_CHAR:
	add	\$1,$XX#b
	movzb	($dat,$XX),$TX#d
	add	$TX#b,$YY#b
	movzb	($dat,$YY),$TY#d
	movb	$TX#b,($dat,$YY)
	movb	$TY#b,($dat,$XX)
	add	$TX#b,$TY#b
	movzb	($dat,$TY),$TY#d
	xorb	($inp),$TY#b
	movb	$TY#b,($out)
	lea	1($inp),$inp
	lea	1($out),$out
	sub	\$1,$len
	jnz	.LRC4_CHAR
	jmp	.Lexit
.size	RC4,.-RC4
___
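
# Collapse the size-suffix markers (see note above) and send the
# generated code down the xlate pipe opened on STDOUT.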
$code =~ s/#([bwd])/$1/gm;
print $code;
close STDOUT;