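;; 371 lines, 9.5 KiB -- file-viewer banner; the viewer labels this file "NASM", but the directives and mnemonics below are TI C6000 assembler syntax.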
;;====================================================================
|
|
;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
;; project.
|
|
;;
|
|
;; Rights for redistribution and usage in source and binary forms are
|
|
;; granted according to the OpenSSL license. Warranty of any kind is
|
|
;; disclaimed.
|
|
;;====================================================================
|
|
;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
|
|
;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
|
|
;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
|
|
;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
|
|
;;====================================================================
|
|
;; All code in this file is placed in the .text section.
.text
|
|
;; When __TI_EABI__ is non-zero, every "_bn_*" label and .global used
;; below is textually replaced by the same name without the leading
;; underscore (.asg <replacement>,<symbol>), so the exported symbols
;; are bn_mul_add_words, bn_mul_words, ... in that configuration.
;; NOTE(review): presumably EABI C symbols carry no underscore prefix
;; while the older ABI does -- confirm against the TI tools guide.
.if __TI_EABI__
|
|
.asg bn_mul_add_words,_bn_mul_add_words
|
|
.asg bn_mul_words,_bn_mul_words
|
|
.asg bn_sqr_words,_bn_sqr_words
|
|
.asg bn_add_words,_bn_add_words
|
|
.asg bn_sub_words,_bn_sub_words
|
|
.asg bn_div_words,_bn_div_words
|
|
.asg bn_sqr_comba8,_bn_sqr_comba8
|
|
.asg bn_mul_comba8,_bn_mul_comba8
|
|
.asg bn_sqr_comba4,_bn_sqr_comba4
|
|
.asg bn_mul_comba4,_bn_mul_comba4
|
|
.endif
|
|
|
|
;; Symbolic register names used by the routines in this file.
;;   RA         - register every routine branches to in order to return
;;   ARG0..ARG5 - incoming arguments, alternating between the A and B
;;                register files (A4, B4, A6, B6, A8, B8)
;;   RET        - return value; note it is the same register as ARG0
;;   FP, DP, SP - defined here but not referenced by the visible code
;; NOTE(review): this mapping presumably mirrors the TI C6000 C calling
;; convention -- confirm against the compiler user's guide.
.asg B3,RA
|
|
.asg A4,ARG0
|
|
.asg B4,ARG1
|
|
.asg A6,ARG2
|
|
.asg B6,ARG3
|
|
.asg A8,ARG4
|
|
.asg B8,ARG5
|
|
.asg A4,RET
|
|
.asg A15,FP
|
|
.asg B14,DP
|
|
.asg B15,SP
|
|
|
|
;;--------------------------------------------------------------------
;; _bn_mul_add_words: multiply a word vector by one word and add the
;; product into another vector, propagating a carry word:
;;   for i in [0, count): rp[i] = low32(rp[i] + ap[i]*w + carry)
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = count (32-bit words), ARG3 = w
;; Out:  RET  = carry word left in A19 after the last element,
;;              or 0 when count is 0
;; Uses: B0, B7, A2, A3, A7, A16-A21, A23, ILC; advances ARG0 and ARG1
;;--------------------------------------------------------------------
.global _bn_mul_add_words
|
|
_bn_mul_add_words:
|
|
.asmfunc
|
|
;; B0 = count: predicates the early return and seeds the loop counter.
MV ARG2,B0
|
|
;; count == 0: return 0 without entering the loop.
[!B0] BNOP RA
|
|
||[!B0] MVK 0,RET
|
|
[B0] MVC B0,ILC
|
|
;; A2 becomes a separate store pointer for rp, because ARG0 itself is
;; post-incremented by the rp[i] loads in the loop; A3 is an A-side
;; copy of the multiplier w.
[B0] ZERO A19 ; high part of accumulator
|
|
|| [B0] MV ARG0,A2
|
|
|| [B0] MV ARG3,A3
|
|
;; NOTE(review): these idle cycles presumably cover both the delay
;; slots of the conditional return above and the latency between
;; writing ILC and issuing SPLOOP -- confirm against the C64x+ CPU guide.
NOP 3
|
|
|
|
SPLOOP 2 ; 2*n+10
|
|
;;====================================================================
|
|
LDW *ARG1++,B7 ; ap[i]
|
|
NOP 3
|
|
LDW *ARG0++,A7 ; rp[i]
|
|
MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i]*w, unsigned 64-bit product
|
|
NOP 3 ; [2,0] in epilogue
|
|
ADDU A16,A7,A21:A20 ; low product word + rp[i], carry kept in A21
|
|
;; Fold in the running carry (A19); A18 is the finished result word and
;; the upper half of the long result lands back in A19.
ADDU A19,A21:A20,A19:A18
|
|
|| MV.S A17,A23 ; park high product word for the add below
|
|
SPKERNEL 2,1 ; leave slot for "return value"
|
|
|| STW A18,*A2++ ; rp[i]
|
|
|| ADD A19,A23,A19 ; next carry = carry-out + high product word
|
|
;;====================================================================
|
|
BNOP RA,4
|
|
MV A19,RET ; return value
|
|
.endasmfunc
|
|
|
|
;;--------------------------------------------------------------------
;; _bn_mul_words: multiply a word vector by one word, propagating a
;; carry word:
;;   for i in [0, count): rp[i] = low32(ap[i]*w + carry)
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = count (32-bit words), ARG3 = w
;; Out:  RET  = carry word left in A19 after the last element,
;;              or 0 when count is 0
;; Uses: B0, A7, A16-A19, A21, ILC; advances ARG0 and ARG1
;;--------------------------------------------------------------------
.global _bn_mul_words
|
|
_bn_mul_words:
|
|
.asmfunc
|
|
;; B0 = count: predicates the early return and seeds the loop counter.
MV ARG2,B0
|
|
;; count == 0: return 0 without entering the loop.
[!B0] BNOP RA
|
|
||[!B0] MVK 0,RET
|
|
[B0] MVC B0,ILC
|
|
[B0] ZERO A19 ; high part of accumulator
|
|
;; NOTE(review): these idle cycles presumably cover both the delay
;; slots of the conditional return above and the latency between
;; writing ILC and issuing SPLOOP -- confirm against the C64x+ CPU guide.
NOP 3
|
|
|
|
SPLOOP 2 ; 2*n+10
|
|
;;====================================================================
|
|
LDW *ARG1++,A7 ; ap[i]
|
|
NOP 4
|
|
MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w, unsigned 64-bit product
|
|
NOP 4 ; [2,0] in epilogue
|
|
;; Add the running carry (A19) to the low product word; A18 is the
;; finished result word and the carry-out lands back in A19.
ADDU A19,A16,A19:A18
|
|
|| MV.S A17,A21 ; park high product word for the add below
|
|
SPKERNEL 2,1 ; leave slot for "return value"
|
|
|| STW A18,*ARG0++ ; rp[i]
|
|
|| ADD.L A19,A21,A19 ; next carry = carry-out + high product word
|
|
;;====================================================================
|
|
BNOP RA,4
|
|
MV A19,RET ; return value
|
|
.endasmfunc
|
|
|
|
;;--------------------------------------------------------------------
;; _bn_sqr_words: square each word of a vector, writing the 64-bit
;; results as consecutive word pairs:
;;   for i in [0, count): rp[2*i] = low32(ap[i]^2), rp[2*i+1] = high32
;; In:   ARG0 = rp (receives 2*count words), ARG1 = ap, ARG2 = count
;; Out:  RET is set to 0 only on the count == 0 path; otherwise no
;;       return value is written (RET shares A4 with ARG0, which ends
;;       up holding the advanced odd-word pointer).
;; Uses: B0, B1, B2, B7, A1, ILC; advances ARG0 and ARG1
;;--------------------------------------------------------------------
.global _bn_sqr_words
|
|
_bn_sqr_words:
|
|
.asmfunc
|
|
;; B0 = count: predicates the early return and seeds the loop counter
;; (B0 is later reused for the low product word).
MV ARG2,B0
|
|
;; count == 0: return without entering the loop.
[!B0] BNOP RA
|
|
||[!B0] MVK 0,RET
|
|
[B0] MVC B0,ILC
|
|
;; Two store pointers, one per register file: B2 -> rp[0] for the even
;; (low) words and ARG0 -> rp[1] for the odd (high) words; each is
;; stepped by 8 bytes per iteration.
[B0] MV ARG0,B2
|
|
|| [B0] ADD 4,ARG0,ARG0
|
|
;; NOTE(review): these idle cycles presumably cover both the delay
;; slots of the conditional return above and the latency between
;; writing ILC and issuing SPLOOP -- confirm against the C64x+ CPU guide.
NOP 3
|
|
|
|
SPLOOP 2 ; 2*n+10
|
|
;;====================================================================
|
|
LDW *ARG1++,B7 ; ap[i]
|
|
NOP 4
|
|
MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]^2, unsigned 64-bit product
|
|
NOP 3 ; [2,0] in epilogue
|
|
STW B0,*B2++(8) ; rp[2*i]
|
|
MV B1,A1 ; high word crosses to the A side for its store
|
|
SPKERNEL 2,0 ; fully overlap BNOP RA,5
|
|
|| STW A1,*ARG0++(8) ; rp[2*i+1]
|
|
;;====================================================================
|
|
BNOP RA,5
|
|
.endasmfunc
|
|
|
|
;;--------------------------------------------------------------------
;; _bn_add_words: add two word vectors with carry propagation:
;;   for i in [0, count): rp[i] = low32(ap[i] + bp[i] + carry)
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = count (32-bit words)
;; Out:  RET  = carry out of the last addition, or 0 when count is 0
;; Uses: B0, B7, A0, A1, A3, A7, A8, A9, ILC; advances ARG1 and ARG2
;;--------------------------------------------------------------------
.global _bn_add_words
|
|
_bn_add_words:
|
|
.asmfunc
|
|
;; B0 = count: predicates the early return and seeds the loop counter.
MV ARG3,B0
|
|
;; count == 0: return 0 without entering the loop.
[!B0] BNOP RA
|
|
||[!B0] MVK 0,RET
|
|
[B0] MVC B0,ILC
|
|
;; rp is moved to A3 because RET (same register as ARG0) is rewritten
;; with the carry on every iteration of the loop below.
[B0] ZERO A1 ; carry flag
|
|
|| [B0] MV ARG0,A3
|
|
;; NOTE(review): these idle cycles presumably cover both the delay
;; slots of the conditional return above and the latency between
;; writing ILC and issuing SPLOOP -- confirm against the C64x+ CPU guide.
NOP 3
|
|
|
|
SPLOOP 2 ; 2*n+6
|
|
;;====================================================================
|
|
LDW *ARG2++,A7 ; bp[i]
|
|
|| LDW *ARG1++,B7 ; ap[i]
|
|
NOP 4
|
|
ADDU A7,B7,A9:A8 ; A9:A8 = ap[i] + bp[i], carry in A9
|
|
ADDU A1,A9:A8,A1:A0 ; add incoming carry; A0 = result, A1 = carry out
|
|
SPKERNEL 0,0 ; fully overlap BNOP RA,5
|
|
|| STW A0,*A3++ ; write result
|
|
|| MV A1,RET ; keep carry flag in RET
|
|
;;====================================================================
|
|
BNOP RA,5
|
|
.endasmfunc
|
|
|
|
;;--------------------------------------------------------------------
;; _bn_sub_words: subtract two word vectors with borrow propagation:
;;   for i in [0, count): rp[i] = low32(ap[i] - bp[i] - borrow)
;; In:   ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = count (32-bit words)
;; Out:  RET  = borrow (0 or 1) out of the last subtraction,
;;              or 0 when count is 0
;; Uses: B0, B7, A0, A1, A2, A3, A7, ILC; advances ARG1 and ARG2
;;--------------------------------------------------------------------
.global _bn_sub_words
|
|
_bn_sub_words:
|
|
.asmfunc
|
|
;; B0 = count: predicates the early return and seeds the loop counter.
MV ARG3,B0
|
|
;; count == 0: return 0 without entering the loop.
[!B0] BNOP RA
|
|
||[!B0] MVK 0,RET
|
|
[B0] MVC B0,ILC
|
|
;; A2 doubles as the predicate for the borrow correction in the loop;
;; rp is copied to A3, which is the pointer the stores use.
[B0] ZERO A2 ; borrow flag
|
|
|| [B0] MV ARG0,A3
|
|
;; NOTE(review): these idle cycles presumably cover both the delay
;; slots of the conditional return above and the latency between
;; writing ILC and issuing SPLOOP -- confirm against the C64x+ CPU guide.
NOP 3
|
|
|
|
SPLOOP 2 ; 2*n+6
|
|
;;====================================================================
|
|
LDW *ARG2++,A7 ; bp[i]
|
|
|| LDW *ARG1++,B7 ; ap[i]
|
|
NOP 4
|
|
SUBU B7,A7,A1:A0 ; A1:A0 = ap[i] - bp[i] as a long result
|
|
[A2] SUB A1:A0,1,A1:A0 ; subtract the incoming borrow, if any
|
|
SPKERNEL 0,1 ; leave slot for "return borrow flag"
|
|
|| STW A0,*A3++ ; write result
|
|
|| AND 1,A1,A2 ; pass on borrow flag
|
|
;;====================================================================
|
|
BNOP RA,4
|
|
AND 1,A1,RET ; return borrow flag
|
|
.endasmfunc
|
|
|
|
;;--------------------------------------------------------------------
;; _bn_div_words: bit-serial (shift, compare, subtract) division of the
;; two-word value hi:lo by the single word dv.
;; In:   A4 (ARG0) = hi, B4 (ARG1) = lo, A6 (ARG2) = dv
;; Out:  A4 (RET)  = quotient bits accumulated in A4, or -1 when hi has
;;                   fewer leading zero bits than dv ("overflow")
;; Uses: A0-A6, B0, ILC
;; The loop counter is 32 + (leading zero bits of dv); one step is
;; executed straight-line ahead of the SPLOOP.
;; NOTE(review): no explicit check for dv == 0 is visible here.
;;--------------------------------------------------------------------
.global _bn_div_words
|
|
_bn_div_words:
|
|
.asmfunc
|
|
LMBD 1,A6,A0 ; leading zero bits in dv
|
|
LMBD 1,A4,A1 ; leading zero bits in hi
|
|
|| MVK 32,B0
|
|
;; A2 = 1 when hi has fewer leading zeros than dv; B0 = 32 + A0 is the
;; iteration count that goes into ILC.
CMPLTU A1,A0,A2
|
|
|| ADD A0,B0,B0
|
|
;; Overflow: return -1.  Everything on the normal path from here to the
;; SPLOOP is predicated on !A2.
[ A2] BNOP RA
|
|
||[ A2] MVK -1,A4 ; return overflow
|
|
||[!A2] MV A4,A3 ; reassign hi
|
|
[!A2] MV B4,A4 ; reassign lo, will be quotient
|
|
||[!A2] MVC B0,ILC
|
|
[!A2] SHL A6,A0,A6 ; normalize dv
|
|
|| MVK 1,A1 ; preset A1 so the [!A1] ops below are skipped when A2 is set
|
|
|
|
;; First division step, executed outside the loop (same three packets
;; as the SPLOOP body below).
[!A2] CMPLTU A3,A6,A1 ; hi<dv?
|
|
||[!A2] SHL A4,1,A5:A4 ; lo<<1
|
|
[!A1] SUB A3,A6,A3 ; hi-=dv
|
|
||[!A1] OR 1,A4,A4 ; quotient
|
|
[!A2] SHRU A3,31,A1 ; upper bit
|
|
||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
|
|
|
|
;; Loop body.  A1 carries the bit shifted out of hi by the previous
;; step: when it is set the compare is skipped and A1 is cleared, so
;; the subtract and quotient-bit update below are performed.
SPLOOP 3
|
|
[!A1] CMPLTU A3,A6,A1 ; hi<dv?
|
|
||[ A1] ZERO A1
|
|
|| SHL A4,1,A5:A4 ; lo<<1
|
|
[!A1] SUB A3,A6,A3 ; hi-=dv
|
|
||[!A1] OR 1,A4,A4 ; quotient
|
|
SHRU A3,31,A1 ; upper bit
|
|
|| ADDAH A5,A3,A3 ; hi<<1|lo>>31
|
|
SPKERNEL
|
|
|
|
BNOP RA,5
|
|
.endasmfunc
|
|
|
|
;;====================================================================
|
|
;; Not really Comba algorithm, just straightforward NxM... Dedicated
|
|
;; fully unrolled real Comba implementations are asymptotically 2x
|
|
;; faster, but naturally larger undertaking. Purpose of this exercise
|
|
;; was rather to learn to master nested SPLOOPs...
|
|
;;====================================================================
|
|
;;--------------------------------------------------------------------
;; _bn_sqr_comba8 / _bn_mul_comba8: 8x8-word multiplication built as a
;; nested loop (outer loop over ap[], reloaded inner SPLOOP over bp[]).
;; In:   ARG0 = rp (result words), ARG1 = ap, ARG2 = bp
;;       _bn_sqr_comba8 takes only ARG0/ARG1: it copies ap into ARG2
;;       and falls through into the multiply.
;; Out:  result words stored through B4; no value is placed in RET.
;; Uses: A0-A3, A5, A7, A9, B0-B2, B4-B7, B16-B21, B23, ILC, RILC
;; The label sploopNxM? is also named by a disabled (.if 0) path in
;; _bn_mul_comba4.
;;--------------------------------------------------------------------
.global _bn_sqr_comba8
|
|
.global _bn_mul_comba8
|
|
_bn_sqr_comba8:
|
|
MV ARG1,ARG2 ; squaring: use ap as the second operand too
|
|
_bn_mul_comba8:
|
|
.asmfunc
|
|
;; B4 is ARG1, so ap is moved to A5 in the same execute packet in which
;; B4 is overwritten with rp.
MVK 8,B0 ; N, RILC
|
|
|| MVK 8,A0 ; M, outer loop counter
|
|
|| MV ARG1,A5 ; copy ap
|
|
|| MV ARG0,B4 ; copy rp
|
|
|| ZERO B19 ; high part of accumulator
|
|
MVC B0,RILC
|
|
|| SUB B0,2,B1 ; N-2, initial ILC
|
|
|| SUB B0,1,B2 ; const B2=N-1
|
|
|| LDW *A5++,B6 ; ap[0]
|
|
|| MV A0,A3 ; const A3=M
|
|
sploopNxM?: ; for best performance arrange M<=N
|
|
;; Start of the first inner pass.  B7 = 0 and A1 = 0 mean that pass adds
;; nothing from rp (the predicated rp[i] load below is skipped);
;; B5 = B4 + N words.
[A0] SPLOOPD 2 ; 2*n+10
|
|
|| MVC B1,ILC
|
|
|| ADDAW B4,B0,B5
|
|
|| ZERO B7
|
|
|| LDW *A5++,A9 ; pre-fetch ap[1]
|
|
|| ZERO A1
|
|
|| SUB A0,1,A0
|
|
;;====================================================================
|
|
;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
|
|
;; This is because of Advisory 15 from TI publication SPRZ247I.
|
|
LDW *ARG2++,A7 ; bp[i]
|
|
NOP 3
|
|
[A1] LDW *B5++,B7 ; rp[i]
|
|
MPY32U A7,B6,B17:B16 ; B17:B16 = bp[i] * current ap word (B6)
|
|
NOP 3
|
|
ADDU B16,B7,B21:B20 ; low product word + rp[i], carry kept in B21
|
|
ADDU B19,B21:B20,B19:B18 ; add running carry; B18 = result word
|
|
|| MV.S B17,B23 ; park high product word for the add below
|
|
SPKERNEL
|
|
|| STW B18,*B4++ ; rp[i]
|
|
|| ADD.S B19,B23,B19 ; next carry = carry-out + high product word
|
|
;;====================================================================
|
|
;; Outer loop: rewinds the pointers, swaps in the next ap word and
;; re-arms the inner loop (SPMASKR) while A0 counts down.
;; NOTE(review): the exact overlap of these packets with the draining
;; SPLOOP is timing-critical -- do not reorder.
outer?: ; m*2*(n+1)+10
|
|
SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
|
|
SPMASKR
|
|
|| CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
|
|
MVD A9,B6 ; move through .M unit(*)
|
|
[A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
|
|
SUBAW B5,B2,B5 ; rewind rp to rp[1]
|
|
MVK 1,A1 ; from now on the inner loop does load rp[i]
|
|
[A0] BNOP.S1 outer?,4
|
|
|| [A0] SUB.L A0,1,A0
|
|
;; Store the final carry of this pass as the next result word, then
;; step B4 back by N-1 words for the next pass.
STW B19,*B4--[B2] ; rewind rp to rp[1]
|
|
|| ZERO.S B19 ; high part of accumulator
|
|
;; end of outer?
|
|
BNOP RA,5 ; return
|
|
.endasmfunc
|
|
;; (*) It should be noted that B6 is used as input to MPY32U in
|
|
;; chronologically next cycle in *preceding* SPLOOP iteration.
|
|
;; Normally such arrangement would require DINT, but at this
|
|
;; point SPLOOP is draining and interrupts are disabled
|
|
;; implicitly.
|
|
|
|
;;--------------------------------------------------------------------
;; _bn_sqr_comba4 / _bn_mul_comba4: 4x4-word multiplication producing
;; eight result words.
;; In:   ARG0 = rp (8 words written as ARG0[0..7]), ARG1 = a, ARG2 = b
;;       _bn_sqr_comba4 takes only ARG0/ARG1: it copies a into ARG2
;;       and falls through into the multiply.
;; Out:  result words stored through ARG0; no value is placed in RET.
;; Uses: A0, A1, A8, A9, A16-A31, B0, B1, B8, B9, B16-B19
;; Only the .else branch is assembled (the first branch is under .if 0).
;; In it, a[0..3] live in B16-B19 and b[0..3] in A16-A19; low product
;; words are summed on the A side, high product words on the B side,
;; and the B-side sums are added across into the next column.
;; NOTE(review): product registers (A20-A31) are reused while earlier
;; multiplies are presumably still in flight, so the instruction order
;; is timing-critical -- do not reorder.
;;--------------------------------------------------------------------
.global _bn_sqr_comba4
|
|
.global _bn_mul_comba4
|
|
_bn_sqr_comba4:
|
|
MV ARG1,ARG2 ; squaring: use a as the second operand too
|
|
_bn_mul_comba4:
|
|
.asmfunc
|
|
;; Disabled alternative: set up N=M=4 and branch into the nested loop
;; at sploopNxM? (a label defined in _bn_mul_comba8).
.if 0
|
|
BNOP sploopNxM?,3
|
|
;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
|
|
;; because of read-after-write penalties, it's rather
|
|
;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
|
|
MVK 4,B0 ; N, RILC
|
|
|| MVK 4,A0 ; M, outer loop counter
|
|
|| MV ARG1,A5 ; copy ap
|
|
|| MV ARG0,B4 ; copy rp
|
|
|| ZERO B19 ; high part of accumulator
|
|
MVC B0,RILC
|
|
|| SUB B0,2,B1 ; first ILC
|
|
|| SUB B0,1,B2 ; const B2=N-1
|
|
|| LDW *A5++,B6 ; ap[0]
|
|
|| MV A0,A3 ; const A3=M
|
|
.else
|
|
;; This alternative is an exercise in fully unrolled Comba
|
|
;; algorithm implementation that operates at n*(n+1)+12, or
|
|
;; as little as 32 cycles...
|
|
;; Load all eight operand words, one pair per cycle.
LDW *ARG1[0],B16 ; a[0]
|
|
|| LDW *ARG2[0],A16 ; b[0]
|
|
LDW *ARG1[1],B17 ; a[1]
|
|
|| LDW *ARG2[1],A17 ; b[1]
|
|
LDW *ARG1[2],B18 ; a[2]
|
|
|| LDW *ARG2[2],A18 ; b[2]
|
|
LDW *ARG1[3],B19 ; a[3]
|
|
|| LDW *ARG2[3],A19 ; b[3]
|
|
NOP
|
|
;; Sixteen partial products, issued one per cycle and interleaved with
;; the column additions and result stores below.
MPY32U A16,B16,A1:A0 ; a[0]*b[0]
|
|
MPY32U A17,B16,A23:A22 ; a[0]*b[1]
|
|
MPY32U A16,B17,A25:A24 ; a[1]*b[0]
|
|
MPY32U A16,B18,A27:A26 ; a[2]*b[0]
|
|
STW A0,*ARG0[0] ; result word 0
|
|
|| MPY32U A17,B17,A29:A28 ; a[1]*b[1]
|
|
MPY32U A18,B16,A31:A30 ; a[0]*b[2]
|
|
|| ADDU A22,A1,A1:A0
|
|
MV A23,B0
|
|
|| MPY32U A19,B16,A21:A20 ; a[3]*b[0]
|
|
|| ADDU A24,A1:A0,A1:A0
|
|
ADDU A25,B0,B1:B0
|
|
|| STW A0,*ARG0[1] ; result word 1
|
|
|| MPY32U A18,B17,A23:A22 ; a[2]*b[1]
|
|
|| ADDU A26,A1,A9:A8
|
|
ADDU A27,B1,B9:B8
|
|
|| MPY32U A17,B18,A25:A24 ; a[1]*b[2]
|
|
|| ADDU A28,A9:A8,A9:A8
|
|
ADDU A29,B9:B8,B9:B8
|
|
|| MPY32U A16,B19,A27:A26 ; a[0]*b[3]
|
|
|| ADDU A30,A9:A8,A9:A8
|
|
ADDU A31,B9:B8,B9:B8
|
|
|| ADDU B0,A9:A8,A9:A8
|
|
STW A8,*ARG0[2] ; result word 2
|
|
|| ADDU A20,A9,A1:A0
|
|
ADDU A21,B9,B1:B0
|
|
|| MPY32U A19,B17,A21:A20 ; a[3]*b[1]
|
|
|| ADDU A22,A1:A0,A1:A0
|
|
ADDU A23,B1:B0,B1:B0
|
|
|| MPY32U A18,B18,A23:A22 ; a[2]*b[2]
|
|
|| ADDU A24,A1:A0,A1:A0
|
|
ADDU A25,B1:B0,B1:B0
|
|
|| MPY32U A17,B19,A25:A24 ; a[1]*b[3]
|
|
|| ADDU A26,A1:A0,A1:A0
|
|
ADDU A27,B1:B0,B1:B0
|
|
|| ADDU B8,A1:A0,A1:A0
|
|
STW A0,*ARG0[3] ; result word 3
|
|
|| MPY32U A19,B18,A27:A26 ; a[3]*b[2]
|
|
|| ADDU A20,A1,A9:A8
|
|
ADDU A21,B1,B9:B8
|
|
|| MPY32U A18,B19,A29:A28 ; a[2]*b[3]
|
|
|| ADDU A22,A9:A8,A9:A8
|
|
ADDU A23,B9:B8,B9:B8
|
|
|| MPY32U A19,B19,A31:A30 ; a[3]*b[3]
|
|
|| ADDU A24,A9:A8,A9:A8
|
|
ADDU A25,B9:B8,B9:B8
|
|
|| ADDU B0,A9:A8,A9:A8
|
|
STW A8,*ARG0[4] ; result word 4
|
|
|| ADDU A26,A9,A1:A0
|
|
ADDU A27,B9,B1:B0
|
|
|| ADDU A28,A1:A0,A1:A0
|
|
;; The return branch is issued here; the remaining packets follow it.
;; NOTE(review): they presumably execute in the branch delay slots --
;; confirm against the C64x+ CPU guide.
ADDU A29,B1:B0,B1:B0
|
|
|| BNOP RA
|
|
|| ADDU B8,A1:A0,A1:A0
|
|
STW A0,*ARG0[5] ; result word 5
|
|
|| ADDU A30,A1,A9:A8
|
|
ADD A31,B1,B8
|
|
ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below
|
|
ADD B8,A9,A9
|
|
|| STW A8,*ARG0[6] ; result word 6
|
|
STW A9,*ARG0[7] ; result word 7
|
|
.endif
|
|
.endasmfunc
|