
This is achieved by filtering perlasm output through arm-xlate.pl. But note that it's done only if the "flavour" argument is not 'void'. As 'void' is the default value for other ARM targets, perlasm output is not actually filtered on previously validated platforms. Reviewed-by: Dr. Stephen Henson <steve@openssl.org>
217 lines
5.7 KiB
Prolog
217 lines
5.7 KiB
Prolog
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# January 2007.

# Montgomery multiplication for ARMv4.
#
# Performance improvement naturally varies among CPU implementations
# and compilers. The code was observed to provide +65-35% improvement
# [depending on key length, less for longer keys] on ARM920T, and
# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
# base and compiler generated code with in-lined umull and even umlal
# instructions. The latter means that this code didn't really have an
# "advantage" of utilizing some "secret" instruction.
#
# The code is interoperable with Thumb ISA and is rather compact, less
# than 1/2KB. Windows CE port would be trivial, as it's exclusively
# about decorations, ABI and instruction syntax are identical.

# Command line: [flavour] output-file
#
# The first argument is the perlasm "flavour".  If it already looks like a
# file name (word characters, a dot and an extension) it is taken to be the
# output file and no flavour is in effect.  Otherwise the remaining
# arguments are scanned for the first one that looks like a file name.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Any flavour other than "void" means the generated code is piped
    # through arm-xlate.pl, looked up next to this script first and in
    # ../../perlasm/ second.  Only trust $1 when the match succeeded.
    $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    # Quote the paths so that directories containing spaces survive the
    # shell, and fail loudly if the translator cannot be started.
    my $cmd = "\"$^X\" \"$xlate\" $flavour";
    $cmd .= " \"$output\"" if $output;
    open STDOUT,"| $cmd" or die "can't call $xlate: $!";
} elsif ($output) {
    # Unfiltered output straight into the requested file.
    open STDOUT,">",$output or die "can't open $output: $!";
}
# With no output file given and no translator in effect, the code is
# written to the inherited STDOUT instead of silently losing it.

# Symbolic register names used by the assembly template below.
# $bp, $bi and $rp are three names for the same register, r2.
$num="r0";	# starts as num argument, but holds &tp[num-1]
$ap="r1";
$bp="r2"; $bi="r2"; $rp="r2";
$np="r3";
$tp="r4";
$aj="r5";
$nj="r6";
$tj="r7";
$n0="r8";
###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
$alo="r10";	# sl, gcc uses it to keep @GOT
$ahi="r11";	# fp
$nlo="r12";	# ip
###########	# r13 is stack pointer
$nhi="r14";	# lr
###########	# r15 is program counter

#### argument block layout relative to &tp[num-1], a.k.a. $num
# Each of these is an operand string of the form "base,#offset", meant to
# be placed between square brackets in a load/store instruction.
$_rp="$num,#12*4";
# ap permanently resides in r1
$_bp="$num,#13*4";
# np permanently resides in r3
$_n0="$num,#14*4";
$_num="$num,#15*4";	$_bpend=$_num;	# the num slot is reused for the bp end marker

# Assembly template for bn_mul_mont.
#
# As the code reads them: r0 is saved as rp, r1 is ap, r2 is bp, r3 is np,
# and two more arguments are taken from the caller's stack: a pointer to
# n0, followed by num.  The routine returns 0 in r0 without doing any work
# when num < 2, and 1 in r0 after the result has been written to rp.
#
# A scratch vector tp of num+1 words is allocated on the stack.  After the
# prologue $num no longer holds the count but points at &tp[num-1], which
# is why the saved arguments are addressed relative to it.
#
# Note: the value stored in the $_bpend slot is bp + 4*(num-1), i.e.
# &bp[num-1], although the "@" comments call it &bp[num].  .Louter
# pre-increments bp and stops once bp equals that value.
#
# The final stage subtracts np from tp into rp, then selects either tp or
# rp as the copy source using a mask derived from the borrow, and
# overwrites tp while copying.
$code=<<___;
.text

.global	bn_mul_mont
.type	bn_mul_mont,%function

.align	2
bn_mul_mont:
	stmdb	sp!,{r0,r2}		@ sp points at argument block
	ldr	$num,[sp,#3*4]		@ load num
	cmp	$num,#2
	movlt	r0,#0
	addlt	sp,sp,#2*4
	blt	.Labrt

	stmdb	sp!,{r4-r12,lr}		@ save 10 registers

	mov	$num,$num,lsl#2		@ rescale $num for byte count
	sub	sp,sp,$num		@ alloca(4*num)
	sub	sp,sp,#4		@ +extra dword
	sub	$num,$num,#4		@ "num=num-1"
	add	$tp,$bp,$num		@ &bp[num-1]

	add	$num,sp,$num		@ $num to point at &tp[num-1]
	ldr	$n0,[$_n0]		@ &n0
	ldr	$bi,[$bp]		@ bp[0]
	ldr	$aj,[$ap],#4		@ ap[0],ap++
	ldr	$nj,[$np],#4		@ np[0],np++
	ldr	$n0,[$n0]		@ *n0
	str	$tp,[$_bpend]		@ save &bp[num]

	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
	str	$n0,[$_n0]		@ save n0 value
	mul	$n0,$alo,$n0		@ "tp[0]"*n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
	mov	$tp,sp

.L1st:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	mov	$alo,$ahi
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.L1st

	adds	$nlo,$nlo,$ahi
	ldr	$tp,[$_bp]		@ restore bp
	mov	$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

.Louter:
	sub	$tj,$num,sp		@ "original" $num-1 value
	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
	ldr	$bi,[$tp,#4]!		@ *(++bp)
	sub	$np,$np,$tj		@ "rewind" np to &np[1]
	ldr	$aj,[$ap,#-4]		@ ap[0]
	ldr	$alo,[sp]		@ tp[0]
	ldr	$nj,[$np,#-4]		@ np[0]
	ldr	$tj,[sp,#4]		@ tp[1]

	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
	str	$tp,[$_bp]		@ save bp
	mul	$n0,$alo,$n0
	mov	$nlo,#0
	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
	mov	$tp,sp

.Linner:
	ldr	$aj,[$ap],#4		@ ap[j],ap++
	adds	$alo,$ahi,$tj		@ +=tp[j]
	ldr	$nj,[$np],#4		@ np[j],np++
	mov	$ahi,#0
	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
	mov	$nhi,#0
	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
	adc	$ahi,$ahi,#0
	ldr	$tj,[$tp,#8]		@ tp[j+1]
	adds	$nlo,$nlo,$alo
	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
	adc	$nlo,$nhi,#0
	cmp	$tp,$num
	bne	.Linner

	adds	$nlo,$nlo,$ahi
	mov	$nhi,#0
	ldr	$tp,[$_bp]		@ restore bp
	adc	$nhi,$nhi,#0
	ldr	$n0,[$_n0]		@ restore n0
	adds	$nlo,$nlo,$tj
	ldr	$tj,[$_bpend]		@ restore &bp[num]
	adc	$nhi,$nhi,#0
	str	$nlo,[$num]		@ tp[num-1]=
	str	$nhi,[$num,#4]		@ tp[num]=

	cmp	$tp,$tj
	bne	.Louter

	ldr	$rp,[$_rp]		@ pull rp
	add	$num,$num,#4		@ $num to point at &tp[num]
	sub	$aj,$num,sp		@ "original" num value
	mov	$tp,sp			@ "rewind" $tp
	mov	$ap,$tp			@ "borrow" $ap
	sub	$np,$np,$aj		@ "rewind" $np to &np[0]

	subs	$tj,$tj,$tj		@ "clear" carry flag
.Lsub:	ldr	$tj,[$tp],#4
	ldr	$nj,[$np],#4
	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
	str	$tj,[$rp],#4		@ rp[j]=
	teq	$tp,$num		@ preserve carry
	bne	.Lsub
	sbcs	$nhi,$nhi,#0		@ upmost carry
	mov	$tp,sp			@ "rewind" $tp
	sub	$rp,$rp,$aj		@ "rewind" $rp

	and	$ap,$tp,$nhi
	bic	$np,$rp,$nhi
	orr	$ap,$ap,$np		@ ap=borrow?tp:rp

.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
	str	sp,[$tp],#4		@ zap tp
	str	$tj,[$rp],#4
	cmp	$tp,$num
	bne	.Lcopy

	add	sp,$num,#4		@ skip over tp[num+1]
	ldmia	sp!,{r4-r12,lr}		@ restore registers
	add	sp,sp,#2*4		@ skip over {r0,r2}
	mov	r0,#1
.Labrt:	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Emit "bx lr" as a literal instruction word (0xe12fff1e) so that the
# output also assembles when the assembler is restricted to ARMv4.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# STDOUT may be a pipe into arm-xlate.pl or a buffered file.  Closing it is
# where a failing translator or a write error becomes visible, so the
# result must be checked rather than ignored.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush