From 496f2b148b53598233174414a2b446e9d2a76019 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 4 May 2014 16:39:59 +0200 Subject: [PATCH] C64x+ assembly pack: add RC4 module. --- Configure | 2 +- TABLE | 4 +- crypto/rc4/asm/rc4-c64xplus.pl | 183 +++++++++++++++++++++++++++++++++ 3 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 crypto/rc4/asm/rc4-c64xplus.pl diff --git a/Configure b/Configure index ba0be3162..791828776 100755 --- a/Configure +++ b/Configure @@ -414,7 +414,7 @@ my %table=( "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}", # # TI_CGT_C6000_7.3.x is a requirement -"linux-c64xplus","cl6x:--linux -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true", +"linux-c64xplus","cl6x:--linux -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o::rc4-c64xplus.o:::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true", # Android: linux-* but without -DTERMIO and pointers to headers and libs. 
"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)", diff --git a/TABLE b/TABLE index 8a46fb77c..bf2dba3b8 100644 --- a/TABLE +++ b/TABLE @@ -1652,7 +1652,7 @@ $multilib = *** debug-VC-WIN32 $cc = cl -$cflags = -W3 -WX -Gs0 -GF -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE +$cflags = -W3 -Gs0 -GF -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE $unistd = $thread_cflag = $sys_id = WIN32 @@ -4174,7 +4174,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o $cast_obj = -$rc4_obj = +$rc4_obj = rc4-c64xplus.o $rmd160_obj = $rc5_obj = $wp_obj = diff --git a/crypto/rc4/asm/rc4-c64xplus.pl b/crypto/rc4/asm/rc4-c64xplus.pl new file mode 100644 index 000000000..6e5fe0549 --- /dev/null +++ b/crypto/rc4/asm/rc4-c64xplus.pl @@ -0,0 +1,183 @@ +#!/usr/bin/env perl +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# RC4 for C64x+. +# +# April 2014 +# +# RC4 subroutine processes one byte in 7.0 cycles, which is 3x faster +# than TI CGT-generated code. Loop is scheduled in such way that +# there is only one reference to memory in each cycle. This is done +# to avoid L1D memory banking conflicts, see SPRU871 TI publication +# for further details. Otherwise it should be possible to schedule +# the loop for iteration interval of 6... 
+
+($KEY,$LEN,$INP,$OUT)=("A4","B4","A6","B6");	# registers the routines read their arguments from
+
+($KEYA,$XX,$TY,$xx,$ONE,$ret)=map("A$_",(5,7,8,9,1,2));	# A-file working registers
+($KEYB,$YY,$TX,$tx,$SUM,$dat)=map("B$_",(5,7,8,9,1,2));	# B-file working registers, same numbers
+
+$code.=<<___;	# interpolating here-doc: the register variables above expand inside it
+	.text
+
+	.if	.ASSEMBLER_VERSION<7000000
+	.asg	0,__TI_EABI__
+	.endif
+	.if	__TI_EABI__
+	.nocmp
+	.asg	RC4,_RC4
+	.asg	RC4_set_key,_RC4_set_key
+	.asg	RC4_options,_RC4_options
+	.endif
+
+	.global	_RC4
+	.align	16
+_RC4:
+	.asmfunc
+	MV	$LEN,B0			; RC4 entry: B0 = len, used as predicate below
+  [!B0]	BNOP	B3			; if (len==0) return;
+||[B0]	ADD	$KEY,2,$KEYA		; both pointers = key+2, so [-2]/[-1] reach x/y
+||[B0]	ADD	$KEY,2,$KEYB
+  [B0]	MVK	1,$ONE
+||[B0]	LDBU	*${KEYA}[-2],$XX	; key->x
+  [B0]	LDBU	*${KEYB}[-1],$YY	; key->y
+||	NOP	4
+
+	ADD4	$ONE,$XX,$XX		; x+1, byte-wise add so it wraps at 256
+	LDBU	*${KEYA}[$XX],$TX	; fetch table byte at x
+||	MVC	$LEN,ILC		; loop trip count = len
+	NOP	4
+;;==================================================
+	SPLOOP	7			; iteration interval of 7 cycles
+||	ADD4	$TX,$YY,$YY		; y += table byte at x
+
+	LDBU	*${KEYB}[$YY],$TY	; fetch table byte at y
+||	MVD	$XX,$xx			; keep current x for the store below
+||	ADD4	$ONE,$XX,$XX		; advance x for the next byte
+	LDBU	*${KEYA}[$XX],$tx	; look-ahead fetch at the next x
+	CMPEQ	$YY,$XX,B0		; next x == y? then the look-ahead is stale
+||	NOP	3
+	STB	$TX,*${KEYB}[$YY]	; swap, first half
+||[B0]	ADD4	$TX,$YY,$YY		; stale case: next y uses the byte just stored at y
+	STB	$TY,*${KEYA}[$xx]	; swap, second half
+||[!B0]	ADD4	$tx,$YY,$YY		; usual case: next y uses the look-ahead byte
+||[!B0]	MVD	$tx,$TX
+	ADD4	$TY,$TX,$SUM		; [0,0] $TX is not replaced by $tx yet!
+||	NOP	2
+	LDBU	*$INP++,$dat		; next input byte
+||	NOP	2
+	LDBU	*${KEYB}[$SUM],$ret	; keystream byte = table byte at the sum
+||	NOP	5
+	XOR.L	$dat,$ret,$ret
+	SPKERNEL
+||	STB	$ret,*$OUT++		; emit output byte
+;;==================================================
+	SUB4	$XX,$ONE,$XX		; undo the look-ahead increment of x
+||	NOP	5
+	STB	$XX,*${KEYA}[-2]	; key->x
+||	SUB4	$YY,$TX,$YY		; undo the look-ahead addition to y
+||	BNOP	B3			; return
+	STB	$YY,*${KEYB}[-1]	; key->y
+||	NOP	5
+	.endasmfunc
+
+	.global	_RC4_set_key
+	.align	16
+_RC4_set_key:
+	.asmfunc
+	.if	.BIG_ENDIAN
+	MVK	0x00000404,$ONE		; RC4_set_key entry: low half of 0x04040404
+||	MVK	0x00000203,B0
+	MVKH	0x04040000,$ONE
+||	MVKH	0x00010000,B0		; B0 = 0x00010203
+	.else
+	MVK	0x00000404,$ONE		; low half of 0x04040404
+||	MVK	0x00000100,B0
+	MVKH	0x04040000,$ONE
+||	MVKH	0x03020000,B0		; B0 = 0x03020100
+	.endif
+	ADD	$KEY,2,$KEYA
+||	ADD	$KEY,2,$KEYB
+||	ADD	$INP,$LEN,$ret		; end of input
+	LDBU	*${INP}++,$dat		; first input byte
+||	MVK	0,$TX
+	STH	$TX,*${KEY}++		; key->x=key->y=0
+||	MV	B0,A0
+||	MVK	64-4,B0			; NOTE(review): count-4 presumably offsets SPLOOPD's delayed exit test - confirm
+
+;;==================================================
+	SPLOOPD	1
+||	MVC	B0,ILC
+
+	STNW	A0,*${KEY}++		; write four table bytes at once
+||	ADD4	$ONE,A0,A0		; next four values: each byte + 4
+	SPKERNEL
+;;==================================================
+
+	MVK	0,$YY
+||	MVK	0,$XX
+	MVK	1,$ONE
+||	MVK	256-1,B0		; NOTE(review): 255 not 256, presumably again due to SPLOOPD - confirm
+
+;;==================================================
+	SPLOOPD	8
+||	MVC	B0,ILC
+
+	ADD4	$dat,$YY,$YY		; y += input byte
+||	CMPEQ	$INP,$ret,A0		; end of input?
+	LDBU	*${KEYB}[$YY],$TY	; fetch table byte at y
+||	MVD	$XX,$xx			; keep current x for the store below
+||	ADD4	$ONE,$XX,$XX		; advance x
+	LDBU	*${KEYA}[$XX],$tx	; look-ahead fetch at the next x
+||[A0]	SUB	$INP,$LEN,$INP		; rewind
+	LDBU	*${INP}++,$dat		; next input byte
+||	CMPEQ	$YY,$XX,B0		; next x == y? then the look-ahead is stale
+||	NOP	3
+	STB	$TX,*${KEYB}[$YY]	; swap, first half
+||[B0]	ADD4	$TX,$YY,$YY
+	STB	$TY,*${KEYA}[$xx]	; swap, second half
+||[!B0]	ADD4	$tx,$YY,$YY
+||[!B0]	MV	$tx,$TX
+	SPKERNEL
+;;==================================================
+
+	BNOP	B3,5			; return
+	.endasmfunc
+
+	.global	_RC4_options
+	.align	16
+_RC4_options:
+_rc4_options:
+	.asmfunc
+	BNOP	B3,1			; return; the next four instructions fill the delay slots
+	ADDKPC	_rc4_options,B4		; B4 = run-time address of the label
+	.if	__TI_EABI__
+	MVKL	\$PCR_OFFSET(rc4_options,_rc4_options),A4
+	MVKH	\$PCR_OFFSET(rc4_options,_rc4_options),A4
+	.else
+	MVKL	(rc4_options-_rc4_options),A4
+	MVKH	(rc4_options-_rc4_options),A4
+	.endif
+	ADD	B4,A4,A4		; A4 = label address + offset to the string
+	.endasmfunc
+
+	.if	__TI_EABI__
+	.sect	".text:rc4_options.const"
+	.else
+	.sect	".const:rc4_options"
+	.endif
+	.align	4
+rc4_options:
+	.cstring "rc4(sploop,char)"
+	.cstring "RC4 for C64+, CRYPTOGAMS by "
+	.align	4
+___
+
+print $code;	# the generated assembly goes to STDOUT
+close STDOUT or die "error closing STDOUT: $!";	# a failed flush must fail the build