bn/asm/armv4-gf2m.pl, modes/asm/ghash-armv4.pl: faster multiplication
algorithm suggested in following paper: Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software Polynomial Multiplication on ARM Processors using the NEON Engine. http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
This commit is contained in:
@@ -20,14 +20,21 @@
|
||||
# length, more for longer keys. Even though NEON 1x1 multiplication
|
||||
# runs in even less cycles, ~30, improvement is measurable only on
|
||||
# longer keys. One has to optimize code elsewhere to get NEON glow...
|
||||
#
|
||||
# April 2014
|
||||
#
|
||||
# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
|
||||
# referred below, which improves ECDH and ECDSA verify benchmarks
|
||||
# by 18-40%.
|
||||
#
|
||||
# C<>mara, D.; Gouv<75>a, C. P. L.; L<>pez, J. & Dahab, R.: Fast Software
|
||||
# Polynomial Multiplication on ARM Processors using the NEON Engine.
|
||||
#
|
||||
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
|
||||
|
||||
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
|
||||
open STDOUT,">$output";
|
||||
|
||||
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
|
||||
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
|
||||
sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
|
||||
|
||||
$code=<<___;
|
||||
#include "arm_arch.h"
|
||||
|
||||
@@ -36,31 +43,6 @@ $code=<<___;
|
||||
|
||||
#if __ARM_ARCH__>=7
|
||||
.fpu neon
|
||||
|
||||
.type mul_1x1_neon,%function
|
||||
.align 5
|
||||
mul_1x1_neon:
|
||||
vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are slided $a
|
||||
vmull.p8 `&Q("d0")`,d16,d17 @ a<>bb
|
||||
vshl.u64 `&Dlo("q2")`,d16,#16
|
||||
vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8<>bb
|
||||
vshl.u64 `&Dlo("q3")`,d16,#24
|
||||
vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16<31>bb
|
||||
vshr.u64 `&Dlo("q1")`,#8
|
||||
vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24<32>bb
|
||||
vshl.u64 `&Dhi("q1")`,#24
|
||||
veor d0,`&Dlo("q1")`
|
||||
vshr.u64 `&Dlo("q2")`,#16
|
||||
veor d0,`&Dhi("q1")`
|
||||
vshl.u64 `&Dhi("q2")`,#16
|
||||
veor d0,`&Dlo("q2")`
|
||||
vshr.u64 `&Dlo("q3")`,#24
|
||||
veor d0,`&Dhi("q2")`
|
||||
vshl.u64 `&Dhi("q3")`,#8
|
||||
veor d0,`&Dlo("q3")`
|
||||
veor d0,`&Dhi("q3")`
|
||||
bx lr
|
||||
.size mul_1x1_neon,.-mul_1x1_neon
|
||||
#endif
|
||||
___
|
||||
################
|
||||
@@ -159,8 +141,9 @@ ___
|
||||
# void bn_GF2m_mul_2x2(BN_ULONG *r,
|
||||
# BN_ULONG a1,BN_ULONG a0,
|
||||
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0<61>b1b0
|
||||
|
||||
($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
|
||||
{
|
||||
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
|
||||
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
|
||||
|
||||
$code.=<<___;
|
||||
.global bn_GF2m_mul_2x2
|
||||
@@ -173,44 +156,58 @@ bn_GF2m_mul_2x2:
|
||||
tst r12,#1
|
||||
beq .Lialu
|
||||
|
||||
veor $A1,$A1
|
||||
vmov.32 $B1,r3,r3 @ two copies of b1
|
||||
vmov.32 ${A1}[0],r1 @ a1
|
||||
ldr r12, [sp] @ 5th argument
|
||||
vmov.32 $a, r2, r1
|
||||
vmov.32 $b, r12, r3
|
||||
vmov.i64 $k48, #0x0000ffffffffffff
|
||||
vmov.i64 $k32, #0x00000000ffffffff
|
||||
vmov.i64 $k16, #0x000000000000ffff
|
||||
|
||||
veor $A0,$A0
|
||||
vld1.32 ${B0}[],[sp,:32] @ two copies of b0
|
||||
vmov.32 ${A0}[0],r2 @ a0
|
||||
mov r12,lr
|
||||
vext.8 $t0#lo, $a, $a, #1 @ A1
|
||||
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
|
||||
vext.8 $r#lo, $b, $b, #1 @ B1
|
||||
vmull.p8 $r, $a, $r#lo @ E = A*B1
|
||||
vext.8 $t1#lo, $a, $a, #2 @ A2
|
||||
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
|
||||
vext.8 $t3#lo, $b, $b, #2 @ B2
|
||||
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
|
||||
vext.8 $t2#lo, $a, $a, #3 @ A3
|
||||
veor $t0, $t0, $r @ L = E + F
|
||||
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
|
||||
vext.8 $r#lo, $b, $b, #3 @ B3
|
||||
veor $t1, $t1, $t3 @ M = G + H
|
||||
vmull.p8 $r, $a, $r#lo @ I = A*B3
|
||||
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
|
||||
vand $t0#hi, $t0#hi, $k48
|
||||
vext.8 $t3#lo, $b, $b, #4 @ B4
|
||||
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
|
||||
vand $t1#hi, $t1#hi, $k32
|
||||
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
|
||||
veor $t2, $t2, $r @ N = I + J
|
||||
veor $t0#lo, $t0#lo, $t0#hi
|
||||
veor $t1#lo, $t1#lo, $t1#hi
|
||||
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
|
||||
vand $t2#hi, $t2#hi, $k16
|
||||
vext.8 $t0, $t0, $t0, #15
|
||||
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
|
||||
vmov.i64 $t3#hi, #0
|
||||
vext.8 $t1, $t1, $t1, #14
|
||||
veor $t2#lo, $t2#lo, $t2#hi
|
||||
vmull.p8 $r, $a, $b @ D = A*B
|
||||
vext.8 $t3, $t3, $t3, #12
|
||||
vext.8 $t2, $t2, $t2, #13
|
||||
veor $t0, $t0, $t1
|
||||
veor $t2, $t2, $t3
|
||||
veor $r, $r, $t0
|
||||
veor $r, $r, $t2
|
||||
|
||||
vmov d16,$A1
|
||||
vmov d17,$B1
|
||||
bl mul_1x1_neon @ a1<61>b1
|
||||
vmov $A1B1,d0
|
||||
|
||||
vmov d16,$A0
|
||||
vmov d17,$B0
|
||||
bl mul_1x1_neon @ a0<61>b0
|
||||
vmov $A0B0,d0
|
||||
|
||||
veor d16,$A0,$A1
|
||||
veor d17,$B0,$B1
|
||||
veor $A0,$A0B0,$A1B1
|
||||
bl mul_1x1_neon @ (a0+a1)<29>(b0+b1)
|
||||
|
||||
veor d0,$A0 @ (a0+a1)<29>(b0+b1)-a0<61>b0-a1<61>b1
|
||||
vshl.u64 d1,d0,#32
|
||||
vshr.u64 d0,d0,#32
|
||||
veor $A0B0,d1
|
||||
veor $A1B1,d0
|
||||
vst1.32 {${A0B0}[0]},[r0,:32]!
|
||||
vst1.32 {${A0B0}[1]},[r0,:32]!
|
||||
vst1.32 {${A1B1}[0]},[r0,:32]!
|
||||
vst1.32 {${A1B1}[1]},[r0,:32]
|
||||
bx r12
|
||||
vst1.32 {$r}, [r0]
|
||||
bx lr
|
||||
.align 4
|
||||
.Lialu:
|
||||
#endif
|
||||
___
|
||||
}
|
||||
$ret="r10"; # reassigned 1st argument
|
||||
$code.=<<___;
|
||||
stmdb sp!,{r4-r10,lr}
|
||||
@@ -272,7 +269,12 @@ $code.=<<___;
|
||||
.comm OPENSSL_armcap_P,4,4
|
||||
___
|
||||
|
||||
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
||||
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
|
||||
print $code;
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
|
||||
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
|
||||
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
close STDOUT; # enforce flush
|
||||
|
||||
Reference in New Issue
Block a user