3e181369dd
tested, because kernel is not in shape to handle it *yet*. The code is committed mostly to stimulate the kernel development.
324 lines
7.6 KiB
Perl
324 lines
7.6 KiB
Perl
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# If compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and is likely to be
# ~15% faster. It would be free from references to intermediate ring
# buffer, but put more pressure on L1P [both because the code would be
# larger and won't be using SPLOOP buffer]. There are no plans to
# realize fully unrolled variant though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.

# Pick the output file: the first command-line argument that looks like
# "name.ext". Arguments in front of it are consumed and ignored. If the
# argument list runs out first, $output ends up false.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to that file. Without a file name the generated code
# stays on the original STDOUT (a failed open would otherwise close it
# and silently discard everything), and a failing open is reported
# instead of being ignored.
if ($output) {
    open(STDOUT,">",$output) or die "can't open $output: $!";
}

# Register allocation. Every name below holds the textual name of a
# C64x+ register and is interpolated into the assembly templates.
($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments

# Working variables A..E, then per-round temporaries: rotated A, the
# two halves of the round function, the running sum and the round
# constant. All of them are A16..A25.
($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
# Message schedule operands X[i], X[i+2], X[i+8], X[i+13].
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
# Message schedule / input temporaries, B28..B31.
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5");			# X circular buffer
# Copies of A..E taken at the top of each block. A6 is reused here,
# which is why the code first moves $NUM to another register.
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM

# Start of the assembly template: entry point, prologue and rounds 0..19.
#
# Entry: $CTX points at five state words (read at word offsets 0..4),
# $INP at the input (read with LDNW and post-increment), $NUM is the
# loop count (presumably the number of 64-byte blocks -- confirm
# against the C prototype). A zero count returns before touching state.
#
# Layout matters to the TI assembler: only labels may start in
# column 1, so every instruction and directive is indented and only
# "||" parallel bars, labels and ";" comments sit at the left margin.
# The text is an interpolating heredoc: "\$" is needed wherever a
# literal dollar sign is wanted in the output.
$code=<<___;
	.text

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

; on big-endian targets the SWAP2/SWAP4 pair is aliased to plain moves
	.if	.BIG_ENDIAN
	.asg	MV,SWAP2
	.asg	MV,SWAP4
	.endif

	.global	_sha1_block_data_order
_sha1_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0			; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	LDW	*${CTX}[0],$A		; load A-E...
|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
   [A0]	LDW	*${CTX}[1],$B
|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
   [A0]	LDW	*${CTX}[2],$C
|| [A0]	MVK	0x00404,B0
   [A0]	LDW	*${CTX}[3],$D
|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for \$XP[AB]
   [A0]	LDW	*${CTX}[4],$E
|| [A0]	MVC	B0,AMR			; setup circular addressing
	LDNW	*${INP}++,$TX1		; pre-fetch input
	NOP	1

; A0 is decremented once per pass; both ring buffer pointers are
; reset to SP+2 words and A-E are copied aside for the final addition
loop?:
	MVK	0x00007999,$K
||	ADDAW	SP,2,$XPA
||	SUB	A0,1,A0
||	MVK	13,B0
	MVKH	0x5a820000,$K		; K_00_19
||	ADDAW	SP,2,$XPB
||	MV	$A,$Actx
||	MV	$B,$Bctx
;;==================================================
	SPLOOPD	5			; BODY_00_13
||	MV	$C,$Cctx
||	MV	$D,$Dctx
||	MV	$E,$Ectx
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX3		; byte swap

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A

	ADD	$TX3,$T,$A		; A=T+Xi
||	STW	$TX3,*${XPB}++
	SPKERNEL
;;==================================================
	ROTL	$A,5,$Arot		; BODY_14
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
||	LDW	*${XPB}[4],$X2		; 2 iterations ahead

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
;;==================================================
	ROTL	$A,5,$Arot		; BODY_15
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	MVK	3,B0
;;==================================================
	SPLOOPD	5			; BODY_16_19
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL

; B0=19 is loaded into ILC by the next loop and is not changed again
; until the K_60_79 set-up
	MVK	0xffffeba1,$K
||	MVK	19,B0
	MVKH	0x6ed90000,$K		; K_20_39
___

# Append one software-pipelined loop of parity rounds (F = B^C^D) to
# $code. The iteration count is whatever the caller left in B0; it is
# moved to ILC in parallel with SPLOOPD. The round constant must
# already be in $K.
#
# Argument: a true value suppresses the trailing K_40_59 constant load.
# Called without arguments (rounds 20..39) the load is appended so the
# following loop finds its constant in $K; the caller for rounds 60..78
# passes -1 to leave $K alone.
#
# The sub is invoked once, without arguments, right after its
# definition.
sub BODY_20_39 {
my ($no_k_40_59) = @_;
$code.=<<___;
;;==================================================
	SPLOOPD	5			; BODY_20_39
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	XOR	$B,$C,$F
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$D,$F,$F		; F_20_39(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++		; last one is redundant
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL
___
$code.=<<___ if (!$no_k_40_59);
	MVK	0xffffbcdc,$K
	MVKH	0x8f1b0000,$K		; K_40_59
___
}

&BODY_20_39();				# BODY_20_39

# Rounds 40..59, followed by the K_60_79 set-up and rounds 60..78.
#
# F_40_59 is built as (B&C)^(B&D)^(C&D). The first two ANDs are issued
# together with SPLOOPD and then again in the last execute packet of
# the loop body, so each pass starts with them already computed.
# ILC is loaded from B0, which this template does not set before the
# loop; it relies on the value left there by the code emitted earlier.
$code.=<<___;
;;==================================================
	SPLOOPD	5			; BODY_40_59
||	MVC	B0,ILC
||	AND	$B,$C,$F
||	AND	$B,$D,$F0

	ROTL	$A,5,$Arot
||	XOR	$F0,$F,$F
||	AND	$C,$D,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_40_59(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; ||	LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	AND	$B,$C,$F
||	AND	$B,$D,$F0
	SPKERNEL

	MVK	0xffffc1d6,$K
||	MVK	18,B0
	MVKH	0xca620000,$K		; K_60_79
___
# Rounds 60..78 reuse the parity-round loop. The true argument keeps it
# from appending the K_40_59 load, so $K stays at K_60_79 for round 79.
&BODY_20_39(-1);			# BODY_60_78

# Round 79, state update, loop-back and function epilogue.
#
# The last round is merged with adding the copies of A..E saved at the
# top of the block, so A..E leave this code holding the updated state.
# While A0 is non-zero the code branches back to "loop?" and pre-fetches
# the next input word. Otherwise it falls through to the epilogue,
# which restores SP and FP, stores A..E back through $CTX and writes
# zero to AMR. The stores follow BNOP RA in program order, presumably
# executing in its branch delay slots -- confirm before reordering.
$code.=<<___;
;;==================================================
   [A0]	B	loop?
||	ROTL	$A,5,$Arot		; BODY_79
||	XOR	$B,$C,$F
||	ROTL	$TX1,1,$TX2		; Xupdate output

   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
||	ADD	$K,$E,$T		; T=E+K
||	XOR	$D,$F,$F		; F_20_39(B,C,D)

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
||	ROTL	$B,30,$C		; C=ROL(B,30)

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx

	ADD	$TX2,$T,$A		; A=T+Xi

	ADD	$Actx,$A,$A		; A+=Actx
||	ADD	$Cctx,$C,$C		; C+=Cctx
;; end of loop?

	BNOP	RA			; return
||	MV	FP,SP			; restore stack pointer
||	LDW	*FP[0],FP		; restore frame pointer
	STW	$A,*${CTX}[0]		; emit A-E...
||	MVK	0,B0
	STW	$B,*${CTX}[1]
||	MVC	B0,AMR			; clear AMR
	STW	$C,*${CTX}[2]
	STW	$D,*${CTX}[3]
	STW	$E,*${CTX}[4]
	.endasmfunc

	.sect	.const
	.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

# Write the generated assembly to STDOUT (redirected to the output file
# when one was named on the command line). Buffered data is only
# flushed by close, so its result is checked: a full disk or similar
# write error must fail the build rather than leave a truncated file.
print $code;
close STDOUT or die "error closing STDOUT: $!";