3e181369dd
tested, because kernel is not in shape to handle it *yet*. The code is committed mostly to stimulate the kernel development.
147 lines
3.5 KiB
Raku
147 lines
3.5 KiB
Raku
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# February 2012
#
# The module implements bn_GF2m_mul_2x2 polynomial multiplication
# used in bn_gf2m.c. It's kind of a low-hanging mechanical port from
# C for the time being... The subroutine runs in 37 cycles, which is
# 4.5x faster than compiler-generated code. Though the comparison is
# totally unfair, because this module utilizes Galois Field Multiply
# instruction.

# Scan the command line: keep shifting arguments until one looks like a
# file name ("name.ext"). That argument is the output file; if none
# matches, $output ends up false and the assembly goes to the inherited
# STDOUT.
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when a file name was actually found, and fail loudly
# if the file cannot be created instead of silently discarding the output.
if (defined($output) && $output ne "") {
	open(STDOUT,">",$output) or die "can't open $output: $!";
}

# Registers holding the arguments of bn_GF2m_mul_2x2: result pointer and
# the four 32-bit input words.
($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");	# argument vector

# Working registers for one 32x32-bit multiplication:
#   $Alo/$Ahi        - low/high halfword of the first operand
#   $Alox0..$Alox3   - products of $Alo with bytes 0..3 of the second operand
#   $Ahix0..$Ahix3   - products of $Ahi with bytes 0..3 of the second operand
($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
# The four bytes of the second operand.
($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
# Registers in which the top-level code stages the operands of the second
# and third multiplications; they alias $Alo and $B_1.
($A,$B)=($Alo,$B_1);
# Register loaded with the constant 0xFF and used as a byte mask.
$xFF="B1";

# mul_1x1_upper($A,$B)
#
# Appends to $code the first half of a 32x32-bit multiplication of the
# registers named by $A and $B: $B is split into four bytes ($B_0..$B_3),
# $A into two halfwords ($Alo,$Ahi), and eight 16x8-bit XORMPY products
# are issued into $Alox0..$Alox3 and $Ahix0..$Ahix3.  The products are
# left in those registers; mul_1x1_merged or mul_1x1_lower combines them.
#
# Every instruction line starts either with the "||" parallel bar or with
# a tab, so that no mnemonic begins in column 1.
sub mul_1x1_upper {
my ($A,$B)=@_;
$code.=<<___;
	EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	AND	$B,$xFF,$B_0
||	SHRU	$B,24,$B_3
	SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	EXTU	$A,16,16,$Alo

	XORMPY	$Alo,$B_2,$Alox2	; 16x8 bits muliplication
||	XORMPY	$Ahi,$B_2,$Ahix2
||	EXTU	$B,16,24,$B_1
	XORMPY	$Alo,$B_0,$Alox0
||	XORMPY	$Ahi,$B_0,$Ahix0
	XORMPY	$Alo,$B_3,$Alox3
||	XORMPY	$Ahi,$B_3,$Ahix3
	XORMPY	$Alo,$B_1,$Alox1
||	XORMPY	$Ahi,$B_1,$Ahix1
___
}
# mul_1x1_merged($OUTlo,$OUThi,$A,$B)
#
# Appends to $code an interleaved sequence that does two things at once:
#   - combines the partial products left in $Alox0..3/$Ahix0..3 by the
#     previous multiplication into the 64-bit result $OUThi:$OUTlo;
#   - starts the next multiplication of the registers named by $A and $B,
#     refilling $Alox0..3/$Ahix0..3 with its partial products.
# The new $Alo*$B_0 product is parked in A1 and moved to $Alox0 only in the
# final packet, because the previous $Alox0 is still read in between.
#
# Every instruction line starts either with the "||" parallel bar or with
# a tab, so that no mnemonic begins in column 1.
sub mul_1x1_merged {
my ($OUTlo,$OUThi,$A,$B)=@_;
$code.=<<___;
	 EXTU	$B,8,24,$B_2		; smash $B to 4 bytes
||	 AND	$B,$xFF,$B_0
||	 SHRU	$B,24,$B_3
	 SHRU	$A,16,   $Ahi		; smash $A to two halfwords
||	 EXTU	$A,16,16,$Alo

	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
||	 XORMPY	$Alo,$B_2,$Alox2
	 XORMPY	$Ahi,$B_2,$Ahix2
||	 EXTU	$B,16,24,$B_1
||	 XORMPY	$Alo,$B_0,A1		; $Alox0
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_0,$Ahix0
||	 XORMPY	$Alo,$B_3,$Alox3
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	 XORMPY	$Ahi,$B_3,$Ahix3
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
||	 XORMPY	$Alo,$B_1,$Alox1
||	 XORMPY	$Ahi,$B_1,$Ahix1
||	 MV	A1,$Alox0
___
}
# mul_1x1_lower($OUTlo,$OUThi)
#
# Appends to $code the second half of a 32x32-bit multiplication: the
# partial products left in $Alox0..3/$Ahix0..3 are shifted into position
# and XOR-ed together into the 64-bit result $OUThi:$OUTlo.  Unlike
# mul_1x1_merged, no further multiplication is started.
#
# Every instruction line starts either with the "||" parallel bar or with
# a tab, so that no mnemonic begins in column 1.
sub mul_1x1_lower {
my ($OUTlo,$OUThi)=@_;
$code.=<<___;
	;NOP
	XOR	$Ahix0,$Alox2,$Ahix0
||	MV	$Ahix2,$OUThi
	NOP
	XOR	$Ahix1,$Alox3,$Ahix1
||	SHL	$Ahix0,16,$OUTlo
||	SHRU	$Ahix0,16,$Ahix0
	XOR	$Alox0,$OUTlo,$OUTlo
||	XOR	$Ahix0,$OUThi,$OUThi
||	SHL	$Alox1,8,$Alox1
||	SHL	$Ahix3,8,$Ahix3
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix3,$OUThi,$OUThi
||	SHL	$Ahix1,24,$Alox1
||	SHRU	$Ahix1,8, $Ahix1
	XOR	$Alox1,$OUTlo,$OUTlo
||	XOR	$Ahix1,$OUThi,$OUThi
___
}
# Assemble bn_GF2m_mul_2x2.  Three 32x32-bit multiplications are chained,
# a0*b0, a1*b1 and (a0+a1)*(b0+b1), with the tail of each one interleaved
# with the head of the next, and the three 64-bit products are then folded
# into the four result words stored at *$rp.
#
# Function prologue: load the 0xFF byte mask.
$code.=<<___;
	.text

	.global	_bn_GF2m_mul_2x2
_bn_GF2m_mul_2x2:
	.asmfunc
	MVK	0xFF,$xFF
___
	mul_1x1_upper($a0,$b0);			# a0*b0
# Stage a1 and b1 for the next multiplication; the first MV is issued in
# parallel with the last packet emitted above.
$code.=<<___;
||	MV	$b1,$B
	MV	$a1,$A
___
	mul_1x1_merged("A28","B28",$A,$B);	# a0*b0/a1*b1
# Stage a0+a1 and b0+b1 for the third multiplication.
$code.=<<___;
||	XOR	$b0,$b1,$B
	XOR	$a0,$a1,$A
___
	mul_1x1_merged("A31","B31",$A,$B);	# a1*b1/(a0+a1)*(b0+b1)
$code.=<<___;
	XOR	A28,A31,A29
||	XOR	B28,B31,B29			; a0*b0+a1*b1
___
	mul_1x1_lower("A30","B30");		# (a0+a1)*(b0+b1)
# Return branch is issued in parallel with the last packet emitted above;
# the remaining packets combine the products and store the result.
$code.=<<___;
||	BNOP	B3
	XOR	A29,A30,A30
||	XOR	B29,B30,B30			; (a0+a1)*(b0+b1)-a0*b0-a1*b1
	XOR	B28,A30,A30
||	STW	A28,*${rp}[0]
	XOR	B30,A31,A31
||	STW	A30,*${rp}[1]
	STW	A31,*${rp}[2]
	STW	B31,*${rp}[3]
	.endasmfunc
___

print $code;
# Buffered write errors only surface at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";