bn/asm/armv4-gf2m.pl, modes/asm/ghash-armv4.pl: faster multiplication

algorithm suggested in the following paper:

Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
Polynomial Multiplication on ARM Processors using the NEON Engine.

http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
Andy Polyakov
2014-04-24 10:16:58 +02:00
parent 558ff0f0c1
commit f8cee9d081
3 changed files with 211 additions and 145 deletions

@@ -20,14 +20,21 @@
# length, more for longer keys. Even though NEON 1x1 multiplication
# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
#
# April 2014
#
# Double bn_GF2m_mul_2x2 performance by using algorithm from paper
# referred below, which improves ECDH and ECDSA verify benchmarks
# by 18-40%.
#
# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
# Polynomial Multiplication on ARM Processors using the NEON Engine.
#
# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
$code=<<___;
#include "arm_arch.h"
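
For orientation: the "1x1 multiplication" discussed in the header comment above is a 32x32->64-bit carry-less multiplication over GF(2)[x]. A minimal portable C reference of that product, a bit-serial sketch for illustration only (the name is mine, not from this commit):

    #include <stdint.h>

    /* Bit-serial carry-less multiply: the product the ~30-cycle NEON 1x1
     * routine computes, spelled out the slow, obvious way. */
    static uint64_t gf2m_mul_1x1_ref(uint32_t a, uint32_t b)
    {
        uint64_t r = 0;
        for (int i = 0; i < 32; i++)
            if ((b >> i) & 1)
                r ^= (uint64_t)a << i;  /* XOR replaces the carrying add */
        return r;
    }
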
@@ -36,31 +43,6 @@ $code=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.type mul_1x1_neon,%function
.align 5
mul_1x1_neon:
vshl.u64 `&Dlo("q1")`,d16,#8 @ q1-q3 are shifted copies of $a
vmull.p8 `&Q("d0")`,d16,d17 @ a·bb
vshl.u64 `&Dlo("q2")`,d16,#16
vmull.p8 q1,`&Dlo("q1")`,d17 @ a<<8·bb
vshl.u64 `&Dlo("q3")`,d16,#24
vmull.p8 q2,`&Dlo("q2")`,d17 @ a<<16·bb
vshr.u64 `&Dlo("q1")`,#8
vmull.p8 q3,`&Dlo("q3")`,d17 @ a<<24·bb
vshl.u64 `&Dhi("q1")`,#24
veor d0,`&Dlo("q1")`
vshr.u64 `&Dlo("q2")`,#16
veor d0,`&Dhi("q1")`
vshl.u64 `&Dhi("q2")`,#16
veor d0,`&Dlo("q2")`
vshr.u64 `&Dlo("q3")`,#24
veor d0,`&Dhi("q2")`
vshl.u64 `&Dhi("q3")`,#8
veor d0,`&Dlo("q3")`
veor d0,`&Dhi("q3")`
bx lr
.size mul_1x1_neon,.-mul_1x1_neon
#endif
___
################
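
The routine removed above builds a 64-bit product out of VMULL.P8, which performs eight independent 8x8->16-bit polynomial multiplications, one per byte lane; cross-lane terms must be assembled separately, hence the shifted copies of $a above and the vext/mask corrections in the replacement further down. A rough scalar model of the instruction's lane semantics, assuming plain byte arrays stand in for the D registers:

    #include <stdint.h>

    /* One VMULL.P8 lane: an 8x8->16-bit carry-less multiply. */
    static uint16_t pmull8_lane(uint8_t a, uint8_t b)
    {
        uint16_t r = 0;
        for (int i = 0; i < 8; i++)
            if ((b >> i) & 1)
                r ^= (uint16_t)((uint16_t)a << i);
        return r;
    }

    /* Scalar model of "vmull.p8 q, d, d": eight independent lane
     * products, with no mixing between lanes. */
    static void vmull_p8_model(uint16_t q[8],
                               const uint8_t a[8], const uint8_t b[8])
    {
        for (int lane = 0; lane < 8; lane++)
            q[lane] = pmull8_lane(a[lane], b[lane]);
    }
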
@@ -159,8 +141,9 @@ ___
# void bn_GF2m_mul_2x2(BN_ULONG *r,
# BN_ULONG a1,BN_ULONG a0,
# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
{
my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
$code.=<<___;
.global bn_GF2m_mul_2x2
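
The NEON path removed in the next hunk, like the ALU path that remains, combines three 1x1 products with the Karatsuba identity over GF(2): a1a0*b1b0 = a1b1*x^64 + ((a0+a1)(b0+b1) + a0b0 + a1b1)*x^32 + a0b0, where + means XOR. A hedged C sketch of that combine; mul_1x1 is a placeholder name for any 32x32->64 carry-less multiply, e.g. the bit-serial reference sketched earlier:

    #include <stdint.h>

    uint64_t mul_1x1(uint32_t a, uint32_t b);  /* placeholder: any clmul */

    static void gf2m_mul_2x2_karatsuba(uint32_t r[4],
                                       uint32_t a1, uint32_t a0,
                                       uint32_t b1, uint32_t b0)
    {
        uint64_t hi  = mul_1x1(a1, b1);            /* a1*b1 */
        uint64_t lo  = mul_1x1(a0, b0);            /* a0*b0 */
        uint64_t mid = mul_1x1(a0 ^ a1, b0 ^ b1);  /* (a0+a1)*(b0+b1) */

        mid ^= hi ^ lo;       /* isolate the middle term a0*b1 + a1*b0 */
        lo  ^= mid << 32;     /* fold it in at bit offset 32 ...       */
        hi  ^= mid >> 32;     /* ... spilling into the upper half      */

        r[0] = (uint32_t)lo;  r[1] = (uint32_t)(lo >> 32);
        r[2] = (uint32_t)hi;  r[3] = (uint32_t)(hi >> 32);
    }
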
@@ -173,44 +156,58 @@ bn_GF2m_mul_2x2:
tst r12,#1
beq .Lialu
veor $A1,$A1
vmov.32 $B1,r3,r3 @ two copies of b1
vmov.32 ${A1}[0],r1 @ a1
ldr r12, [sp] @ 5th argument
vmov.32 $a, r2, r1
vmov.32 $b, r12, r3
vmov.i64 $k48, #0x0000ffffffffffff
vmov.i64 $k32, #0x00000000ffffffff
vmov.i64 $k16, #0x000000000000ffff
veor $A0,$A0
vld1.32 ${B0}[],[sp,:32] @ two copies of b0
vmov.32 ${A0}[0],r2 @ a0
mov r12,lr
vext.8 $t0#lo, $a, $a, #1 @ A1
vmull.p8 $t0, $t0#lo, $b @ F = A1*B
vext.8 $r#lo, $b, $b, #1 @ B1
vmull.p8 $r, $a, $r#lo @ E = A*B1
vext.8 $t1#lo, $a, $a, #2 @ A2
vmull.p8 $t1, $t1#lo, $b @ H = A2*B
vext.8 $t3#lo, $b, $b, #2 @ B2
vmull.p8 $t3, $a, $t3#lo @ G = A*B2
vext.8 $t2#lo, $a, $a, #3 @ A3
veor $t0, $t0, $r @ L = E + F
vmull.p8 $t2, $t2#lo, $b @ J = A3*B
vext.8 $r#lo, $b, $b, #3 @ B3
veor $t1, $t1, $t3 @ M = G + H
vmull.p8 $r, $a, $r#lo @ I = A*B3
veor $t0#lo, $t0#lo, $t0#hi @ t0 = (L) (P0 + P1) << 8
vand $t0#hi, $t0#hi, $k48
vext.8 $t3#lo, $b, $b, #4 @ B4
veor $t1#lo, $t1#lo, $t1#hi @ t1 = (M) (P2 + P3) << 16
vand $t1#hi, $t1#hi, $k32
vmull.p8 $t3, $a, $t3#lo @ K = A*B4
veor $t2, $t2, $r @ N = I + J
veor $t0#lo, $t0#lo, $t0#hi
veor $t1#lo, $t1#lo, $t1#hi
veor $t2#lo, $t2#lo, $t2#hi @ t2 = (N) (P4 + P5) << 24
vand $t2#hi, $t2#hi, $k16
vext.8 $t0, $t0, $t0, #15
veor $t3#lo, $t3#lo, $t3#hi @ t3 = (K) (P6 + P7) << 32
vmov.i64 $t3#hi, #0
vext.8 $t1, $t1, $t1, #14
veor $t2#lo, $t2#lo, $t2#hi
vmull.p8 $r, $a, $b @ D = A*B
vext.8 $t3, $t3, $t3, #12
vext.8 $t2, $t2, $t2, #13
veor $t0, $t0, $t1
veor $t2, $t2, $t3
veor $r, $r, $t0
veor $r, $r, $t2
vmov d16,$A1
vmov d17,$B1
bl mul_1x1_neon @ a1·b1
vmov $A1B1,d0
vmov d16,$A0
vmov d17,$B0
bl mul_1x1_neon @ a0·b0
vmov $A0B0,d0
veor d16,$A0,$A1
veor d17,$B0,$B1
veor $A0,$A0B0,$A1B1
bl mul_1x1_neon @ (a0+a1)·(b0+b1)
veor d0,$A0 @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
vshl.u64 d1,d0,#32
vshr.u64 d0,d0,#32
veor $A0B0,d1
veor $A1B1,d0
vst1.32 {${A0B0}[0]},[r0,:32]!
vst1.32 {${A0B0}[1]},[r0,:32]!
vst1.32 {${A1B1}[0]},[r0,:32]!
vst1.32 {${A1B1}[1]},[r0,:32]
bx r12
vst1.32 {$r}, [r0]
bx lr
.align 4
.Lialu:
#endif
___
}
$ret="r10"; # reassigned 1st argument
$code.=<<___;
stmdb sp!,{r4-r10,lr}
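
Both code paths share one contract: r[3..0] receives the 128-bit carry-less product of the 64-bit operands a1a0 and b1b0, least-significant 32-bit word first. A standalone bit-serial reference, an illustration rather than part of the commit, that could cross-check either path:

    #include <stdint.h>

    /* Reference for r[3..0] = a1a0 * b1b0 over GF(2). */
    static void gf2m_mul_2x2_bitwise(uint32_t r[4],
                                     uint32_t a1, uint32_t a0,
                                     uint32_t b1, uint32_t b0)
    {
        uint64_t a = ((uint64_t)a1 << 32) | a0;
        uint64_t b = ((uint64_t)b1 << 32) | b0;
        uint64_t lo = 0, hi = 0;

        for (int i = 0; i < 64; i++) {
            if ((b >> i) & 1) {
                lo ^= a << i;                  /* low 64 bits of a*x^i   */
                if (i) hi ^= a >> (64 - i);    /* bits shifted past 2^64 */
            }
        }
        r[0] = (uint32_t)lo;  r[1] = (uint32_t)(lo >> 32);
        r[2] = (uint32_t)hi;  r[3] = (uint32_t)(hi >> 32);
    }
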
@@ -272,7 +269,12 @@ $code.=<<___;
.comm OPENSSL_armcap_P,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
print $code;
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
print $_,"\n";
}
close STDOUT; # enforce flush