PPC assembler pack update from HEAD.

This commit is contained in:
Andy Polyakov 2011-11-14 20:54:17 +00:00
parent 1a111921da
commit cf96d71c22
8 changed files with 940 additions and 410 deletions

View File

@ -7,7 +7,7 @@
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Needs more work: key setup, page boundaries, CBC routine...
# Needs more work: key setup, CBC routine...
#
# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
# 128-bit key, which is ~40% better than 64-bit code generated by gcc
@ -18,7 +18,7 @@
# February 2010
#
# Rescheduling instructions to favour Power6 pipeline gives 10%
# Rescheduling instructions to favour Power6 pipeline gave 10%
# performance improvement on the platfrom in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process byte in 18 cycles, only in 23, because it fails to issue
@ -33,11 +33,13 @@ $flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
@ -116,15 +118,19 @@ LAES_Te:
addi $Tbl0,$Tbl0,`128-8`
mtlr r0
blr
.space `32-24`
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
LAES_Td:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
addi $Tbl0,$Tbl0,`128-8-32+2048+256`
addi $Tbl0,$Tbl0,`128-64-8+2048+256`
mtlr r0
blr
.space `128-32-24`
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `128-64-9*4`
___
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@ -328,10 +334,9 @@ $code.=<<___;
.globl .AES_encrypt
.align 7
.AES_encrypt:
mflr r0
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@ -352,7 +357,14 @@ $code.=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
andi. $t0,$inp,3
andi. $t1,$out,3
or. $t0,$t0,$t1
bne Lenc_unaligned
Lenc_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@ -363,8 +375,80 @@ $code.=<<___;
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
b Lenc_done
$POP r0,`$FRAME-$SIZE_T*21`($sp)
Lenc_unaligned:
subfic $t0,$inp,4096
subfic $t1,$out,4096
andi. $t0,$t0,4096-16
beq Lenc_xpage
andi. $t1,$t1,4096-16
bne Lenc_unaligned_ok
Lenc_xpage:
lbz $acc00,0($inp)
lbz $acc01,1($inp)
lbz $acc02,2($inp)
lbz $s0,3($inp)
lbz $acc04,4($inp)
lbz $acc05,5($inp)
lbz $acc06,6($inp)
lbz $s1,7($inp)
lbz $acc08,8($inp)
lbz $acc09,9($inp)
lbz $acc10,10($inp)
insrwi $s0,$acc00,8,0
lbz $s2,11($inp)
insrwi $s1,$acc04,8,0
lbz $acc12,12($inp)
insrwi $s0,$acc01,8,8
lbz $acc13,13($inp)
insrwi $s1,$acc05,8,8
lbz $acc14,14($inp)
insrwi $s0,$acc02,8,16
lbz $s3,15($inp)
insrwi $s1,$acc06,8,16
insrwi $s2,$acc08,8,0
insrwi $s3,$acc12,8,0
insrwi $s2,$acc09,8,8
insrwi $s3,$acc13,8,8
insrwi $s2,$acc10,8,16
insrwi $s3,$acc14,8,16
bl LAES_Te
bl Lppc_AES_encrypt_compact
extrwi $acc00,$s0,8,0
extrwi $acc01,$s0,8,8
stb $acc00,0($out)
extrwi $acc02,$s0,8,16
stb $acc01,1($out)
stb $acc02,2($out)
extrwi $acc04,$s1,8,0
stb $s0,3($out)
extrwi $acc05,$s1,8,8
stb $acc04,4($out)
extrwi $acc06,$s1,8,16
stb $acc05,5($out)
stb $acc06,6($out)
extrwi $acc08,$s2,8,0
stb $s1,7($out)
extrwi $acc09,$s2,8,8
stb $acc08,8($out)
extrwi $acc10,$s2,8,16
stb $acc09,9($out)
stb $acc10,10($out)
extrwi $acc12,$s3,8,0
stb $s2,11($out)
extrwi $acc13,$s3,8,8
stb $acc12,12($out)
extrwi $acc14,$s3,8,16
stb $acc13,13($out)
stb $acc14,14($out)
stb $s3,15($out)
Lenc_done:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@ -388,18 +472,21 @@ $code.=<<___;
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
.align 5
Lppc_AES_encrypt:
lwz $acc00,240($key)
lwz $t0,0($key)
lwz $t1,4($key)
lwz $t2,8($key)
lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
lwz $t2,8($key)
addi $acc00,$acc00,-1
lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@ -413,44 +500,44 @@ Lenc_loop:
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
lwz $t1,4($key)
rlwinm $acc04,$s1,`32-16+3`,21,28
lwz $t1,4($key)
rlwinm $acc05,$s2,`32-16+3`,21,28
lwz $t2,8($key)
lwz $t3,12($key)
rlwinm $acc06,$s3,`32-16+3`,21,28
lwz $t3,12($key)
rlwinm $acc07,$s0,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0+3`,21,28
lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0+3`,21,28
lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@ -466,60 +553,60 @@ Lenc_loop:
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s1,`32-16`,24,31
lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s2,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc06,$s3,`32-16`,24,31
lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc07,$s0,`32-16`,24,31
lwz $acc12,`2048+128`($Tbl0)
lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lwz $acc14,`2048+192`($Tbl0)
lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc10,$s0,`32-8`,24,31
lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc00,$Tbl2,$acc00
lbzx $acc01,$Tbl2,$acc01
rlwinm $acc12,$s3,`0`,24,31
lbzx $acc01,$Tbl2,$acc01
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc02,$Tbl2,$acc02
lbzx $acc03,$Tbl2,$acc03
rlwinm $acc14,$s1,`0`,24,31
lbzx $acc03,$Tbl2,$acc03
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc04,$Tbl2,$acc04
lbzx $acc05,$Tbl2,$acc05
rlwinm $s0,$acc00,24,0,7
lbzx $acc05,$Tbl2,$acc05
rlwinm $s1,$acc01,24,0,7
lbzx $acc06,$Tbl2,$acc06
lbzx $acc07,$Tbl2,$acc07
rlwinm $s2,$acc02,24,0,7
lbzx $acc07,$Tbl2,$acc07
rlwinm $s3,$acc03,24,0,7
lbzx $acc08,$Tbl2,$acc08
lbzx $acc09,$Tbl2,$acc09
rlwimi $s0,$acc04,16,8,15
lbzx $acc09,$Tbl2,$acc09
rlwimi $s1,$acc05,16,8,15
lbzx $acc10,$Tbl2,$acc10
lbzx $acc11,$Tbl2,$acc11
rlwimi $s2,$acc06,16,8,15
lbzx $acc11,$Tbl2,$acc11
rlwimi $s3,$acc07,16,8,15
lbzx $acc12,$Tbl2,$acc12
lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc08,8,16,23
lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc09,8,16,23
lbzx $acc14,$Tbl2,$acc14
lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc10,8,16,23
lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc11,8,16,23
or $s0,$s0,$acc12
or $s1,$s1,$acc13
@ -530,29 +617,31 @@ Lenc_loop:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_encrypt_compact:
lwz $acc00,240($key)
lwz $t0,0($key)
lwz $t1,4($key)
lwz $t2,8($key)
lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
lwz $t0,0($key)
lis $mask80,0x8080
lwz $t1,4($key)
lis $mask1b,0x1b1b
addi $key,$key,16
lwz $t2,8($key)
ori $mask80,$mask80,0x8080
lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
addi $key,$key,16
mtctr $acc00
.align 4
Lenc_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s1,`32-16`,24,31
@ -560,48 +649,48 @@ Lenc_compact_loop:
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0`,24,31
lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0`,24,31
lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
lwz $t1,4($key)
or $s0,$s0,$acc12
lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
lwz $t3,12($key)
or $s2,$s2,$acc14
lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@ -612,12 +701,12 @@ Lenc_compact_loop:
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
srwi $acc05,$acc01,7
srwi $acc06,$acc02,7
srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@ -633,32 +722,32 @@ Lenc_compact_loop:
and $acc03,$acc03,$mask1b
xor $acc00,$acc00,$acc08 # r2
xor $acc01,$acc01,$acc09
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
rotlwi $acc12,$s0,16 # ROTATE(r0,16)
xor $acc02,$acc02,$acc10
rotlwi $acc13,$s1,16
xor $acc03,$acc03,$acc11
rotlwi $acc14,$s2,16
rotlwi $acc15,$s3,16
xor $s0,$s0,$acc00 # r0^r2
rotlwi $acc15,$s3,16
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
xor $s2,$s2,$acc02
rotrwi $s1,$s1,24
xor $s3,$s3,$acc03
rotrwi $s2,$s2,24
rotrwi $s3,$s3,24
xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
rotrwi $s3,$s3,24
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
rotlwi $acc09,$acc13,8
rotlwi $acc10,$acc14,8
rotlwi $acc11,$acc15,8
xor $s0,$s0,$acc12 #
rotlwi $acc09,$acc13,8
xor $s1,$s1,$acc13
rotlwi $acc10,$acc14,8
xor $s2,$s2,$acc14
rotlwi $acc11,$acc15,8
xor $s3,$s3,$acc15
xor $s0,$s0,$acc08 #
xor $s1,$s1,$acc09
@ -673,14 +762,15 @@ Lenc_compact_done:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .AES_decrypt
.align 7
.AES_decrypt:
mflr r0
$STU $sp,-$FRAME($sp)
mflr r0
$PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@ -701,7 +791,14 @@ Lenc_compact_done:
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
andi. $t0,$inp,3
andi. $t1,$out,3
or. $t0,$t0,$t1
bne Ldec_unaligned
Ldec_unaligned_ok:
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
@ -712,8 +809,80 @@ Lenc_compact_done:
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
b Ldec_done
$POP r0,`$FRAME-$SIZE_T*21`($sp)
Ldec_unaligned:
subfic $t0,$inp,4096
subfic $t1,$out,4096
andi. $t0,$t0,4096-16
beq Ldec_xpage
andi. $t1,$t1,4096-16
bne Ldec_unaligned_ok
Ldec_xpage:
lbz $acc00,0($inp)
lbz $acc01,1($inp)
lbz $acc02,2($inp)
lbz $s0,3($inp)
lbz $acc04,4($inp)
lbz $acc05,5($inp)
lbz $acc06,6($inp)
lbz $s1,7($inp)
lbz $acc08,8($inp)
lbz $acc09,9($inp)
lbz $acc10,10($inp)
insrwi $s0,$acc00,8,0
lbz $s2,11($inp)
insrwi $s1,$acc04,8,0
lbz $acc12,12($inp)
insrwi $s0,$acc01,8,8
lbz $acc13,13($inp)
insrwi $s1,$acc05,8,8
lbz $acc14,14($inp)
insrwi $s0,$acc02,8,16
lbz $s3,15($inp)
insrwi $s1,$acc06,8,16
insrwi $s2,$acc08,8,0
insrwi $s3,$acc12,8,0
insrwi $s2,$acc09,8,8
insrwi $s3,$acc13,8,8
insrwi $s2,$acc10,8,16
insrwi $s3,$acc14,8,16
bl LAES_Td
bl Lppc_AES_decrypt_compact
extrwi $acc00,$s0,8,0
extrwi $acc01,$s0,8,8
stb $acc00,0($out)
extrwi $acc02,$s0,8,16
stb $acc01,1($out)
stb $acc02,2($out)
extrwi $acc04,$s1,8,0
stb $s0,3($out)
extrwi $acc05,$s1,8,8
stb $acc04,4($out)
extrwi $acc06,$s1,8,16
stb $acc05,5($out)
stb $acc06,6($out)
extrwi $acc08,$s2,8,0
stb $s1,7($out)
extrwi $acc09,$s2,8,8
stb $acc08,8($out)
extrwi $acc10,$s2,8,16
stb $acc09,9($out)
stb $acc10,10($out)
extrwi $acc12,$s3,8,0
stb $s2,11($out)
extrwi $acc13,$s3,8,8
stb $acc12,12($out)
extrwi $acc14,$s3,8,16
stb $acc13,13($out)
stb $acc14,14($out)
stb $s3,15($out)
Ldec_done:
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@ -737,18 +906,21 @@ Lenc_compact_done:
mtlr r0
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
.align 5
Lppc_AES_decrypt:
lwz $acc00,240($key)
lwz $t0,0($key)
lwz $t1,4($key)
lwz $t2,8($key)
lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
lwz $t0,0($key)
addi $Tbl2,$Tbl0,2
lwz $t1,4($key)
addi $Tbl3,$Tbl0,1
lwz $t2,8($key)
addi $acc00,$acc00,-1
lwz $t3,12($key)
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
@ -762,44 +934,44 @@ Ldec_loop:
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
lwz $t1,4($key)
rlwinm $acc04,$s3,`32-16+3`,21,28
lwz $t1,4($key)
rlwinm $acc05,$s0,`32-16+3`,21,28
lwz $t2,8($key)
lwz $t3,12($key)
rlwinm $acc06,$s1,`32-16+3`,21,28
lwz $t3,12($key)
rlwinm $acc07,$s2,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
lwzx $acc01,$Tbl0,$acc01
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
lwzx $acc03,$Tbl0,$acc03
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0+3`,21,28
lwzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0+3`,21,28
lwzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
lwzx $acc09,$Tbl2,$acc09
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
lwzx $acc11,$Tbl2,$acc11
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
lwzx $acc13,$Tbl3,$acc13
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
lwzx $acc15,$Tbl3,$acc15
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
@ -815,56 +987,56 @@ Ldec_loop:
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
lwz $t1,4($key)
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
lwz $t3,12($key)
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s3,`32-16`,24,31
lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc05,$s0,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
lwz $acc11,`2048+96`($Tbl0)
lbzx $acc00,$Tbl2,$acc00
lwz $acc11,`2048+96`($Tbl0)
lbzx $acc01,$Tbl2,$acc01
lwz $acc12,`2048+128`($Tbl0)
lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc06,$s1,`32-16`,24,31
lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc07,$s2,`32-16`,24,31
lwz $acc14,`2048+192`($Tbl0)
lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl2,$acc02
lbzx $acc03,$Tbl2,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
lbzx $acc03,$Tbl2,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl2,$acc04
lbzx $acc05,$Tbl2,$acc05
rlwinm $acc12,$s1,`0`,24,31
lbzx $acc05,$Tbl2,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl2,$acc06
lbzx $acc07,$Tbl2,$acc07
rlwinm $acc14,$s3,`0`,24,31
lbzx $acc07,$Tbl2,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl2,$acc08
lbzx $acc09,$Tbl2,$acc09
rlwinm $s0,$acc00,24,0,7
lbzx $acc09,$Tbl2,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl2,$acc10
lbzx $acc11,$Tbl2,$acc11
rlwinm $s2,$acc02,24,0,7
lbzx $acc11,$Tbl2,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl2,$acc12
lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc04,16,8,15
lbzx $acc13,$Tbl2,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl2,$acc14
lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc06,16,8,15
lbzx $acc15,$Tbl2,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
@ -879,20 +1051,22 @@ Ldec_loop:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.align 4
Lppc_AES_decrypt_compact:
lwz $acc00,240($key)
lwz $t0,0($key)
lwz $t1,4($key)
lwz $t2,8($key)
lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
lwz $t0,0($key)
lis $mask80,0x8080
lwz $t1,4($key)
lis $mask1b,0x1b1b
addi $key,$key,16
lwz $t2,8($key)
ori $mask80,$mask80,0x8080
lwz $t3,12($key)
ori $mask1b,$mask1b,0x1b1b
addi $key,$key,16
___
$code.=<<___ if ($SIZE_T==8);
insrdi $mask80,$mask80,32,0
@ -904,10 +1078,10 @@ $code.=<<___;
Ldec_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
xor $s2,$s2,$t2
rlwinm $acc01,$s1,`32-24`,24,31
xor $s3,$s3,$t3
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s3,`32-16`,24,31
@ -915,48 +1089,48 @@ Ldec_compact_loop:
rlwinm $acc06,$s1,`32-16`,24,31
rlwinm $acc07,$s2,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
lbzx $acc01,$Tbl1,$acc01
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
lbzx $acc03,$Tbl1,$acc03
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0`,24,31
lbzx $acc05,$Tbl1,$acc05
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0`,24,31
lbzx $acc07,$Tbl1,$acc07
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
lbzx $acc09,$Tbl1,$acc09
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
lbzx $acc11,$Tbl1,$acc11
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
lbzx $acc13,$Tbl1,$acc13
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
lbzx $acc15,$Tbl1,$acc15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
lwz $t1,4($key)
or $s0,$s0,$acc12
lwz $t1,4($key)
or $s1,$s1,$acc13
lwz $t2,8($key)
lwz $t3,12($key)
or $s2,$s2,$acc14
lwz $t3,12($key)
or $s3,$s3,$acc15
addi $key,$key,16
@ -1030,12 +1204,12 @@ $code.=<<___ if ($SIZE_T==4);
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
srwi $acc05,$acc01,7
srwi $acc06,$acc02,7
srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
srwi $acc05,$acc01,7
andc $acc09,$s1,$mask80
srwi $acc06,$acc02,7
andc $acc10,$s2,$mask80
srwi $acc07,$acc03,7
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
@ -1059,12 +1233,12 @@ $code.=<<___ if ($SIZE_T==4);
and $acc06,$acc02,$mask80
and $acc07,$acc03,$mask80
srwi $acc08,$acc04,7 # r1>>7
srwi $acc09,$acc05,7
srwi $acc10,$acc06,7
srwi $acc11,$acc07,7
andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
srwi $acc09,$acc05,7
andc $acc13,$acc01,$mask80
srwi $acc10,$acc06,7
andc $acc14,$acc02,$mask80
srwi $acc11,$acc07,7
andc $acc15,$acc03,$mask80
sub $acc04,$acc04,$acc08 # r1-(r1>>7)
sub $acc05,$acc05,$acc09
@ -1085,13 +1259,13 @@ $code.=<<___ if ($SIZE_T==4);
and $acc08,$acc04,$mask80 # r1=r4&0x80808080
and $acc09,$acc05,$mask80
and $acc10,$acc06,$mask80
and $acc11,$acc07,$mask80
srwi $acc12,$acc08,7 # r1>>7
and $acc10,$acc06,$mask80
srwi $acc13,$acc09,7
and $acc11,$acc07,$mask80
srwi $acc14,$acc10,7
srwi $acc15,$acc11,7
sub $acc08,$acc08,$acc12 # r1-(r1>>7)
srwi $acc15,$acc11,7
sub $acc09,$acc09,$acc13
sub $acc10,$acc10,$acc14
sub $acc11,$acc11,$acc15
@ -1124,10 +1298,10 @@ ___
$code.=<<___;
rotrwi $s0,$s0,8 # = ROTATE(r0,8)
rotrwi $s1,$s1,8
rotrwi $s2,$s2,8
rotrwi $s3,$s3,8
xor $s0,$s0,$acc00 # ^= r2^r0
rotrwi $s2,$s2,8
xor $s1,$s1,$acc01
rotrwi $s3,$s3,8
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
xor $acc00,$acc00,$acc08
@ -1135,32 +1309,32 @@ $code.=<<___;
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
xor $s0,$s0,$acc04 # ^= r4^r0
xor $s1,$s1,$acc05
xor $s2,$s2,$acc06
xor $s3,$s3,$acc07
rotrwi $acc00,$acc00,24
xor $s1,$s1,$acc05
rotrwi $acc01,$acc01,24
xor $s2,$s2,$acc06
rotrwi $acc02,$acc02,24
xor $s3,$s3,$acc07
rotrwi $acc03,$acc03,24
xor $acc04,$acc04,$acc08
xor $acc05,$acc05,$acc09
xor $acc06,$acc06,$acc10
xor $acc07,$acc07,$acc11
xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
xor $s1,$s1,$acc09
xor $s2,$s2,$acc10
xor $s3,$s3,$acc11
rotrwi $acc04,$acc04,16
xor $s1,$s1,$acc09
rotrwi $acc05,$acc05,16
xor $s2,$s2,$acc10
rotrwi $acc06,$acc06,16
xor $s3,$s3,$acc11
rotrwi $acc07,$acc07,16
xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotrwi $acc08,$acc08,8
xor $s1,$s1,$acc01
rotrwi $acc09,$acc09,8
xor $s2,$s2,$acc02
rotrwi $acc10,$acc10,8
xor $s3,$s3,$acc03
rotrwi $acc11,$acc11,8
xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
xor $s1,$s1,$acc05
@ -1179,7 +1353,9 @@ Ldec_compact_done:
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.long 0
.byte 0,12,0x14,0,0,0,0,0
.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
.align 7
___

View File

@ -31,7 +31,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=4;
$RZONE= 224;
$FRAME= $SIZE_T*16;
$LD= "lwz"; # load
$LDU= "lwzu"; # load and update
@ -51,7 +50,6 @@ if ($flavour =~ /32/) {
$BNSZ= $BITS/8;
$SIZE_T=8;
$RZONE= 288;
$FRAME= $SIZE_T*16;
# same as above, but 64-bit mnemonics...
$LD= "ld"; # load
@ -69,6 +67,9 @@ if ($flavour =~ /32/) {
$POP= $LD;
} else { die "nonsense $flavour"; }
$FRAME=8*$SIZE_T+$RZONE;
$LOCALS=8*$SIZE_T;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
@ -89,18 +90,18 @@ $aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r14";
$j="r15";
$tp="r16";
$m0="r17";
$m1="r18";
$lo0="r19";
$hi0="r20";
$lo1="r21";
$hi1="r22";
$alo="r23";
$ahi="r24";
$nlo="r25";
$i="r20";
$j="r21";
$tp="r22";
$m0="r23";
$m1="r24";
$lo0="r25";
$hi0="r26";
$lo1="r27";
$hi1="r28";
$alo="r29";
$ahi="r30";
$nlo="r31";
#
$nhi="r0";
@ -108,42 +109,48 @@ $code=<<___;
.machine "any"
.text
.globl .bn_mul_mont
.globl .bn_mul_mont_int
.align 4
.bn_mul_mont:
.bn_mul_mont_int:
cmpwi $num,4
mr $rp,r3 ; $rp is reassigned
li r3,0
bltlr
___
$code.=<<___ if ($BNSZ==4);
cmpwi $num,32 ; longer key performance is not better
bgelr
___
$code.=<<___;
slwi $num,$num,`log($BNSZ)/log(2)`
li $tj,-4096
addi $ovf,$num,`$FRAME+$RZONE`
addi $ovf,$num,$FRAME
subf $ovf,$ovf,$sp ; $sp-$ovf
and $ovf,$ovf,$tj ; minimize TLB usage
subf $ovf,$sp,$ovf ; $ovf-$sp
mr $tj,$sp
srwi $num,$num,`log($BNSZ)/log(2)`
$STUX $sp,$sp,$ovf
$PUSH r14,`4*$SIZE_T`($sp)
$PUSH r15,`5*$SIZE_T`($sp)
$PUSH r16,`6*$SIZE_T`($sp)
$PUSH r17,`7*$SIZE_T`($sp)
$PUSH r18,`8*$SIZE_T`($sp)
$PUSH r19,`9*$SIZE_T`($sp)
$PUSH r20,`10*$SIZE_T`($sp)
$PUSH r21,`11*$SIZE_T`($sp)
$PUSH r22,`12*$SIZE_T`($sp)
$PUSH r23,`13*$SIZE_T`($sp)
$PUSH r24,`14*$SIZE_T`($sp)
$PUSH r25,`15*$SIZE_T`($sp)
$PUSH r20,`-12*$SIZE_T`($tj)
$PUSH r21,`-11*$SIZE_T`($tj)
$PUSH r22,`-10*$SIZE_T`($tj)
$PUSH r23,`-9*$SIZE_T`($tj)
$PUSH r24,`-8*$SIZE_T`($tj)
$PUSH r25,`-7*$SIZE_T`($tj)
$PUSH r26,`-6*$SIZE_T`($tj)
$PUSH r27,`-5*$SIZE_T`($tj)
$PUSH r28,`-4*$SIZE_T`($tj)
$PUSH r29,`-3*$SIZE_T`($tj)
$PUSH r30,`-2*$SIZE_T`($tj)
$PUSH r31,`-1*$SIZE_T`($tj)
$LD $n0,0($n0) ; pull n0[0] value
addi $num,$num,-2 ; adjust $num for counter register
$LD $m0,0($bp) ; m0=bp[0]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$FRAME
addi $tp,$sp,$LOCALS
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[0]
$UMULH $hi0,$aj,$m0
@ -205,8 +212,8 @@ L1st:
Louter:
$LDX $m0,$bp,$i ; m0=bp[i]
$LD $aj,0($ap) ; ap[0]
addi $tp,$sp,$FRAME
$LD $tj,$FRAME($sp) ; tp[0]
addi $tp,$sp,$LOCALS
$LD $tj,$LOCALS($sp); tp[0]
$UMULL $lo0,$aj,$m0 ; ap[0]*bp[i]
$UMULH $hi0,$aj,$m0
$LD $aj,$BNSZ($ap) ; ap[1]
@ -273,7 +280,7 @@ Linner:
addi $num,$num,2 ; restore $num
subfc $j,$j,$j ; j=0 and "clear" XER[CA]
addi $tp,$sp,$FRAME
addi $tp,$sp,$LOCALS
mtctr $num
.align 4
@ -299,23 +306,27 @@ Lcopy: ; copy or in-place refresh
addi $j,$j,$BNSZ
bdnz- Lcopy
$POP r14,`4*$SIZE_T`($sp)
$POP r15,`5*$SIZE_T`($sp)
$POP r16,`6*$SIZE_T`($sp)
$POP r17,`7*$SIZE_T`($sp)
$POP r18,`8*$SIZE_T`($sp)
$POP r19,`9*$SIZE_T`($sp)
$POP r20,`10*$SIZE_T`($sp)
$POP r21,`11*$SIZE_T`($sp)
$POP r22,`12*$SIZE_T`($sp)
$POP r23,`13*$SIZE_T`($sp)
$POP r24,`14*$SIZE_T`($sp)
$POP r25,`15*$SIZE_T`($sp)
$POP $sp,0($sp)
$POP $tj,0($sp)
li r3,1
$POP r20,`-12*$SIZE_T`($tj)
$POP r21,`-11*$SIZE_T`($tj)
$POP r22,`-10*$SIZE_T`($tj)
$POP r23,`-9*$SIZE_T`($tj)
$POP r24,`-8*$SIZE_T`($tj)
$POP r25,`-7*$SIZE_T`($tj)
$POP r26,`-6*$SIZE_T`($tj)
$POP r27,`-5*$SIZE_T`($tj)
$POP r28,`-4*$SIZE_T`($tj)
$POP r29,`-3*$SIZE_T`($tj)
$POP r30,`-2*$SIZE_T`($tj)
$POP r31,`-1*$SIZE_T`($tj)
mr $sp,$tj
blr
.long 0
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
.byte 0,12,4,0,0x80,12,6,0
.long 0
.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

View File

@ -389,7 +389,9 @@ $data=<<EOF;
$ST r9,`6*$BNSZ`(r3) #r[6]=c1
$ST r10,`7*$BNSZ`(r3) #r[7]=c2
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
#
# NOTE: The following label name should be changed to
@ -814,8 +816,9 @@ $data=<<EOF;
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
#
# NOTE: The following label name should be changed to
@ -966,7 +969,9 @@ $data=<<EOF;
$ST r10,`6*$BNSZ`(r3) #r[6]=c1
$ST r11,`7*$BNSZ`(r3) #r[7]=c2
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
#
# NOTE: The following label name should be changed to
@ -1502,7 +1507,9 @@ $data=<<EOF;
$ST r12,`14*$BNSZ`(r3) #r[14]=c3;
$ST r10,`15*$BNSZ`(r3) #r[15]=c1;
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
#
# NOTE: The following label name should be changed to
@ -1550,8 +1557,9 @@ Lppcasm_sub_adios:
subfze r3,r0 # if carry bit is set then r3 = 0 else -1
andi. r3,r3,1 # keep only last bit.
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
#
# NOTE: The following label name should be changed to
@ -1594,7 +1602,9 @@ Lppcasm_add_mainloop:
Lppcasm_add_adios:
addze r3,r0 #return carry bit.
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
#
# NOTE: The following label name should be changed to
@ -1707,7 +1717,9 @@ Lppcasm_div8:
Lppcasm_div9:
or r3,r8,r0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
#
# NOTE: The following label name should be changed to
@ -1746,8 +1758,9 @@ Lppcasm_sqr_mainloop:
bdnz- Lppcasm_sqr_mainloop
Lppcasm_sqr_adios:
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
#
# NOTE: The following label name should be changed to
@ -1850,7 +1863,9 @@ Lppcasm_mw_REM:
Lppcasm_mw_OVER:
addi r3,r12,0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
#
# NOTE: The following label name should be changed to
@ -1973,7 +1988,9 @@ Lppcasm_maw_leftover:
Lppcasm_maw_adios:
addi r3,r12,0
blr
.long 0x00000000
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.align 4
EOF
$data =~ s/\`([^\`]*)\`/eval $1/gem;

View File

@ -45,23 +45,40 @@
# on 1.8GHz PPC970, it's only 5-55% faster. Still far from impressive
# in absolute terms, but it's apparently the way Power 6 is...
# December 2009
# Adapted for 32-bit build this module delivers 25-120%, yes, more
# than *twice* for longer keys, performance improvement over 32-bit
# ppc-mont.pl on 1.8GHz PPC970. However! This implementation utilizes
# even 64-bit integer operations and the trouble is that most PPC
# operating systems don't preserve upper halves of general purpose
# registers upon 32-bit signal delivery. They do preserve them upon
# context switch, but not signalling:-( This means that asynchronous
# signals have to be blocked upon entry to this subroutine. Signal
# masking (and of course complementary unmasking) has quite an impact
# on performance, naturally larger for shorter keys. It's so severe
# that 512-bit key performance can be as low as 1/3 of expected one.
# This is why this routine can be engaged for longer key operations
# only on these OSes, see crypto/ppccap.c for further details. MacOS X
# is an exception from this and doesn't require signal masking, and
# that's where above improvement coefficients were collected. For
# others alternative would be to break dependence on upper halves of
# GPRs by sticking to 32-bit integer operations...
$flavour = shift;
if ($flavour =~ /32/) {
$SIZE_T=4;
$RZONE= 224;
$FRAME= $SIZE_T*12+8*12;
$fname= "bn_mul_mont_ppc64";
$fname= "bn_mul_mont_fpu64";
$STUX= "stwux"; # store indexed and update
$PUSH= "stw";
$POP= "lwz";
die "not implemented yet";
} elsif ($flavour =~ /64/) {
$SIZE_T=8;
$RZONE= 288;
$FRAME= $SIZE_T*12+8*12;
$fname= "bn_mul_mont";
$fname= "bn_mul_mont_fpu64";
# same as above, but 64-bit mnemonics...
$STUX= "stdux"; # store indexed and update
@ -76,7 +93,7 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=($FRAME+63)&~63;
$FRAME=64; # padded frame header
$TRANSFER=16*8;
$carry="r0";
@ -93,16 +110,16 @@ $tp="r10";
$j="r11";
$i="r12";
# non-volatile registers
$nap_d="r14"; # interleaved ap and np in double format
$a0="r15"; # ap[0]
$t0="r16"; # temporary registers
$t1="r17";
$t2="r18";
$t3="r19";
$t4="r20";
$t5="r21";
$t6="r22";
$t7="r23";
$nap_d="r22"; # interleaved ap and np in double format
$a0="r23"; # ap[0]
$t0="r24"; # temporary registers
$t1="r25";
$t2="r26";
$t3="r27";
$t4="r28";
$t5="r29";
$t6="r30";
$t7="r31";
# PPC offers enough register bank capacity to unroll inner loops twice
#
@ -132,28 +149,17 @@ $ba="f0"; $bb="f1"; $bc="f2"; $bd="f3";
$na="f4"; $nb="f5"; $nc="f6"; $nd="f7";
$dota="f8"; $dotb="f9";
$A0="f10"; $A1="f11"; $A2="f12"; $A3="f13";
$N0="f14"; $N1="f15"; $N2="f16"; $N3="f17";
$T0a="f18"; $T0b="f19";
$T1a="f20"; $T1b="f21";
$T2a="f22"; $T2b="f23";
$T3a="f24"; $T3b="f25";
$N0="f20"; $N1="f21"; $N2="f22"; $N3="f23";
$T0a="f24"; $T0b="f25";
$T1a="f26"; $T1b="f27";
$T2a="f28"; $T2b="f29";
$T3a="f30"; $T3b="f31";
# sp----------->+-------------------------------+
# | saved sp |
# +-------------------------------+
# | |
# +-------------------------------+
# | 10 saved gpr, r14-r23 |
# . .
# . .
# +12*size_t +-------------------------------+
# | 12 saved fpr, f14-f25 |
# . .
# . .
# +12*8 +-------------------------------+
# | padding to 64 byte boundary |
# . .
# +X +-------------------------------+
# +64 +-------------------------------+
# | 16 gpr<->fpr transfer zone |
# . .
# . .
@ -173,6 +179,16 @@ $T3a="f24"; $T3b="f25";
# . .
# . .
# +-------------------------------+
# . .
# -12*size_t +-------------------------------+
# | 10 saved gpr, r22-r31 |
# . .
# . .
# -12*8 +-------------------------------+
# | 12 saved fpr, f20-f31 |
# . .
# . .
# +-------------------------------+
$code=<<___;
.machine "any"
@ -181,14 +197,14 @@ $code=<<___;
.globl .$fname
.align 5
.$fname:
cmpwi $num,4
cmpwi $num,`3*8/$SIZE_T`
mr $rp,r3 ; $rp is reassigned
li r3,0 ; possible "not handled" return code
bltlr-
andi. r0,$num,1 ; $num has to be even
andi. r0,$num,`16/$SIZE_T-1` ; $num has to be "even"
bnelr-
slwi $num,$num,3 ; num*=8
slwi $num,$num,`log($SIZE_T)/log(2)` ; num*=sizeof(BN_LONG)
li $i,-4096
slwi $tp,$num,2 ; place for {an}p_{lh}[num], i.e. 4*num
add $tp,$tp,$num ; place for tp[num+1]
@ -196,35 +212,50 @@ $code=<<___;
subf $tp,$tp,$sp ; $sp-$tp
and $tp,$tp,$i ; minimize TLB usage
subf $tp,$sp,$tp ; $tp-$sp
mr $i,$sp
$STUX $sp,$sp,$tp ; alloca
$PUSH r14,`2*$SIZE_T`($sp)
$PUSH r15,`3*$SIZE_T`($sp)
$PUSH r16,`4*$SIZE_T`($sp)
$PUSH r17,`5*$SIZE_T`($sp)
$PUSH r18,`6*$SIZE_T`($sp)
$PUSH r19,`7*$SIZE_T`($sp)
$PUSH r20,`8*$SIZE_T`($sp)
$PUSH r21,`9*$SIZE_T`($sp)
$PUSH r22,`10*$SIZE_T`($sp)
$PUSH r23,`11*$SIZE_T`($sp)
stfd f14,`12*$SIZE_T+0`($sp)
stfd f15,`12*$SIZE_T+8`($sp)
stfd f16,`12*$SIZE_T+16`($sp)
stfd f17,`12*$SIZE_T+24`($sp)
stfd f18,`12*$SIZE_T+32`($sp)
stfd f19,`12*$SIZE_T+40`($sp)
stfd f20,`12*$SIZE_T+48`($sp)
stfd f21,`12*$SIZE_T+56`($sp)
stfd f22,`12*$SIZE_T+64`($sp)
stfd f23,`12*$SIZE_T+72`($sp)
stfd f24,`12*$SIZE_T+80`($sp)
stfd f25,`12*$SIZE_T+88`($sp)
$PUSH r22,`-12*8-10*$SIZE_T`($i)
$PUSH r23,`-12*8-9*$SIZE_T`($i)
$PUSH r24,`-12*8-8*$SIZE_T`($i)
$PUSH r25,`-12*8-7*$SIZE_T`($i)
$PUSH r26,`-12*8-6*$SIZE_T`($i)
$PUSH r27,`-12*8-5*$SIZE_T`($i)
$PUSH r28,`-12*8-4*$SIZE_T`($i)
$PUSH r29,`-12*8-3*$SIZE_T`($i)
$PUSH r30,`-12*8-2*$SIZE_T`($i)
$PUSH r31,`-12*8-1*$SIZE_T`($i)
stfd f20,`-12*8`($i)
stfd f21,`-11*8`($i)
stfd f22,`-10*8`($i)
stfd f23,`-9*8`($i)
stfd f24,`-8*8`($i)
stfd f25,`-7*8`($i)
stfd f26,`-6*8`($i)
stfd f27,`-5*8`($i)
stfd f28,`-4*8`($i)
stfd f29,`-3*8`($i)
stfd f30,`-2*8`($i)
stfd f31,`-1*8`($i)
___
$code.=<<___ if ($SIZE_T==8);
ld $a0,0($ap) ; pull ap[0] value
ld $n0,0($n0) ; pull n0[0] value
ld $t3,0($bp) ; bp[0]
___
$code.=<<___ if ($SIZE_T==4);
mr $t1,$n0
lwz $a0,0($ap) ; pull ap[0,1] value
lwz $t0,4($ap)
lwz $n0,0($t1) ; pull n0[0,1] value
lwz $t1,4($t1)
lwz $t3,0($bp) ; bp[0,1]
lwz $t2,4($bp)
insrdi $a0,$t0,32,0
insrdi $n0,$t1,32,0
insrdi $t3,$t2,32,0
___
$code.=<<___;
addi $tp,$sp,`$FRAME+$TRANSFER+8+64`
li $i,-64
add $nap_d,$tp,$num
@ -258,6 +289,8 @@ $code=<<___;
std $t5,`$FRAME+40`($sp)
std $t6,`$FRAME+48`($sp)
std $t7,`$FRAME+56`($sp)
___
$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@ -266,6 +299,18 @@ $code=<<___;
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
___
$code.=<<___ if ($SIZE_T==4);
lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
lwz $t1,4($ap)
lwz $t2,8($ap)
lwz $t3,12($ap)
lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
lwz $t5,4($np)
lwz $t6,8($np)
lwz $t7,12($np)
___
$code.=<<___;
lfd $ba,`$FRAME+0`($sp)
lfd $bb,`$FRAME+8`($sp)
lfd $bc,`$FRAME+16`($sp)
@ -374,6 +419,8 @@ $code=<<___;
.align 5
L1st:
___
$code.=<<___ if ($SIZE_T==8);
lwz $t0,4($ap) ; load a[j] as 32-bit word pair
lwz $t1,0($ap)
lwz $t2,12($ap) ; load a[j+1] as 32-bit word pair
@ -382,6 +429,18 @@ L1st:
lwz $t5,0($np)
lwz $t6,12($np) ; load n[j+1] as 32-bit word pair
lwz $t7,8($np)
___
$code.=<<___ if ($SIZE_T==4);
lwz $t0,0($ap) ; load a[j..j+3] as 32-bit word pairs
lwz $t1,4($ap)
lwz $t2,8($ap)
lwz $t3,12($ap)
lwz $t4,0($np) ; load n[j..j+3] as 32-bit word pairs
lwz $t5,4($np)
lwz $t6,8($np)
lwz $t7,12($np)
___
$code.=<<___;
std $t0,`$FRAME+64`($sp)
std $t1,`$FRAME+72`($sp)
std $t2,`$FRAME+80`($sp)
@ -559,7 +618,17 @@ L1st:
li $i,8 ; i=1
.align 5
Louter:
___
$code.=<<___ if ($SIZE_T==8);
ldx $t3,$bp,$i ; bp[i]
___
$code.=<<___ if ($SIZE_T==4);
add $t0,$bp,$i
lwz $t3,0($t0) ; bp[i,i+1]
lwz $t0,4($t0)
insrdi $t3,$t0,32,0
___
$code.=<<___;
ld $t6,`$FRAME+$TRANSFER+8`($sp) ; tp[0]
mulld $t7,$a0,$t3 ; ap[0]*bp[i]
@ -761,6 +830,13 @@ Linner:
stfd $T0b,`$FRAME+8`($sp)
add $t7,$t7,$carry
addc $t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
extrdi $t0,$t0,32,0
extrdi $t1,$t1,32,0
adde $t0,$t0,$t1
___
$code.=<<___;
stfd $T1a,`$FRAME+16`($sp)
stfd $T1b,`$FRAME+24`($sp)
insrdi $t4,$t7,16,0 ; 64..127 bits
@ -768,6 +844,13 @@ Linner:
stfd $T2a,`$FRAME+32`($sp)
stfd $T2b,`$FRAME+40`($sp)
adde $t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
extrdi $t4,$t4,32,0
extrdi $t2,$t2,32,0
adde $t4,$t4,$t2
___
$code.=<<___;
stfd $T3a,`$FRAME+48`($sp)
stfd $T3b,`$FRAME+56`($sp)
addze $carry,$carry
@ -816,7 +899,21 @@ Linner:
ld $t7,`$FRAME+72`($sp)
addc $t3,$t0,$t1
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
extrdi $t0,$t0,32,0
extrdi $t1,$t1,32,0
adde $t0,$t0,$t1
___
$code.=<<___;
adde $t5,$t4,$t2
___
$code.=<<___ if ($SIZE_T==4); # adjust XER[CA]
extrdi $t4,$t4,32,0
extrdi $t2,$t2,32,0
adde $t4,$t4,$t2
___
$code.=<<___;
addze $carry,$carry
std $t3,-16($tp) ; tp[j-1]
@ -835,7 +932,9 @@ Linner:
subf $nap_d,$t7,$nap_d ; rewind pointer
cmpw $i,$num
blt- Louter
___
$code.=<<___ if ($SIZE_T==8);
subf $np,$num,$np ; rewind np
addi $j,$j,1 ; restore counter
subfc $i,$i,$i ; j=0 and "clear" XER[CA]
@ -883,34 +982,105 @@ Lcopy: ; copy or in-place refresh
stdx $i,$t4,$i
addi $i,$i,16
bdnz- Lcopy
___
$code.=<<___ if ($SIZE_T==4);
subf $np,$num,$np ; rewind np
addi $j,$j,1 ; restore counter
subfc $i,$i,$i ; j=0 and "clear" XER[CA]
addi $tp,$sp,`$FRAME+$TRANSFER`
addi $np,$np,-4
addi $rp,$rp,-4
addi $ap,$sp,`$FRAME+$TRANSFER+4`
mtctr $j
.align 4
Lsub: ld $t0,8($tp) ; load tp[j..j+3] in 64-bit word order
ldu $t2,16($tp)
lwz $t4,4($np) ; load np[j..j+3] in 32-bit word order
lwz $t5,8($np)
lwz $t6,12($np)
lwzu $t7,16($np)
extrdi $t1,$t0,32,0
extrdi $t3,$t2,32,0
subfe $t4,$t4,$t0 ; tp[j]-np[j]
stw $t0,4($ap) ; save tp[j..j+3] in 32-bit word order
subfe $t5,$t5,$t1 ; tp[j+1]-np[j+1]
stw $t1,8($ap)
subfe $t6,$t6,$t2 ; tp[j+2]-np[j+2]
stw $t2,12($ap)
subfe $t7,$t7,$t3 ; tp[j+3]-np[j+3]
stwu $t3,16($ap)
stw $t4,4($rp)
stw $t5,8($rp)
stw $t6,12($rp)
stwu $t7,16($rp)
bdnz- Lsub
li $i,0
subfe $ovf,$i,$ovf ; handle upmost overflow bit
addi $tp,$sp,`$FRAME+$TRANSFER+4`
subf $rp,$num,$rp ; rewind rp
and $ap,$tp,$ovf
andc $np,$rp,$ovf
or $ap,$ap,$np ; ap=borrow?tp:rp
addi $tp,$sp,`$FRAME+$TRANSFER`
mtctr $j
.align 4
Lcopy: ; copy or in-place refresh
lwz $t0,4($ap)
lwz $t1,8($ap)
lwz $t2,12($ap)
lwzu $t3,16($ap)
std $i,8($nap_d) ; zap nap_d
std $i,16($nap_d)
std $i,24($nap_d)
std $i,32($nap_d)
std $i,40($nap_d)
std $i,48($nap_d)
std $i,56($nap_d)
stdu $i,64($nap_d)
stw $t0,4($rp)
stw $t1,8($rp)
stw $t2,12($rp)
stwu $t3,16($rp)
std $i,8($tp) ; zap tp at once
stdu $i,16($tp)
bdnz- Lcopy
___
$POP r14,`2*$SIZE_T`($sp)
$POP r15,`3*$SIZE_T`($sp)
$POP r16,`4*$SIZE_T`($sp)
$POP r17,`5*$SIZE_T`($sp)
$POP r18,`6*$SIZE_T`($sp)
$POP r19,`7*$SIZE_T`($sp)
$POP r20,`8*$SIZE_T`($sp)
$POP r21,`9*$SIZE_T`($sp)
$POP r22,`10*$SIZE_T`($sp)
$POP r23,`11*$SIZE_T`($sp)
lfd f14,`12*$SIZE_T+0`($sp)
lfd f15,`12*$SIZE_T+8`($sp)
lfd f16,`12*$SIZE_T+16`($sp)
lfd f17,`12*$SIZE_T+24`($sp)
lfd f18,`12*$SIZE_T+32`($sp)
lfd f19,`12*$SIZE_T+40`($sp)
lfd f20,`12*$SIZE_T+48`($sp)
lfd f21,`12*$SIZE_T+56`($sp)
lfd f22,`12*$SIZE_T+64`($sp)
lfd f23,`12*$SIZE_T+72`($sp)
lfd f24,`12*$SIZE_T+80`($sp)
lfd f25,`12*$SIZE_T+88`($sp)
$POP $sp,0($sp)
$code.=<<___;
$POP $i,0($sp)
li r3,1 ; signal "handled"
$POP r22,`-12*8-10*$SIZE_T`($i)
$POP r23,`-12*8-9*$SIZE_T`($i)
$POP r24,`-12*8-8*$SIZE_T`($i)
$POP r25,`-12*8-7*$SIZE_T`($i)
$POP r26,`-12*8-6*$SIZE_T`($i)
$POP r27,`-12*8-5*$SIZE_T`($i)
$POP r28,`-12*8-4*$SIZE_T`($i)
$POP r29,`-12*8-3*$SIZE_T`($i)
$POP r30,`-12*8-2*$SIZE_T`($i)
$POP r31,`-12*8-1*$SIZE_T`($i)
lfd f20,`-12*8`($i)
lfd f21,`-11*8`($i)
lfd f22,`-10*8`($i)
lfd f23,`-9*8`($i)
lfd f24,`-8*8`($i)
lfd f25,`-7*8`($i)
lfd f26,`-6*8`($i)
lfd f27,`-5*8`($i)
lfd f28,`-4*8`($i)
lfd f29,`-3*8`($i)
lfd f30,`-2*8`($i)
lfd f31,`-1*8`($i)
mr $sp,$i
blr
.long 0
.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@fy.chalmers.se>"
.byte 0,12,4,0,0x8c,10,6,0
.long 0
.asciz "Montgomery Multiplication for PPC64, CRYPTOGAMS by <appro\@openssl.org>"
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

115
crypto/ppccap.c Normal file
View File

@ -0,0 +1,115 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <setjmp.h>
#include <signal.h>
#include <crypto.h>
#include <openssl/bn.h>
#define PPC_FPU64 (1<<0)
#define PPC_ALTIVEC (1<<1)
static int OPENSSL_ppccap_P = 0;
static sigset_t all_masked;
#ifdef OPENSSL_BN_ASM_MONT
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num)
{
int bn_mul_mont_fpu64(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
int bn_mul_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np, const BN_ULONG *n0, int num);
if (sizeof(size_t)==4)
{
#if (defined(__APPLE__) && defined(__MACH__))
if (num>=8 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64))
return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
#else
/* boundary of 32 was experimentally determined on
Linux 2.6.22, might have to be adjusted on AIX... */
if (num>=32 && (num&3)==0 && (OPENSSL_ppccap_P&PPC_FPU64))
{
sigset_t oset;
int ret;
sigprocmask(SIG_SETMASK,&all_masked,&oset);
ret=bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
sigprocmask(SIG_SETMASK,&oset,NULL);
return ret;
}
#endif
}
else if ((OPENSSL_ppccap_P&PPC_FPU64))
/* this is a "must" on POWER6, but run-time detection
* is not implemented yet... */
return bn_mul_mont_fpu64(rp,ap,bp,np,n0,num);
return bn_mul_mont_int(rp,ap,bp,np,n0,num);
}
#endif
static sigjmp_buf ill_jmp;
static void ill_handler (int sig) { siglongjmp(ill_jmp,sig); }
void OPENSSL_ppc64_probe(void);
void OPENSSL_cpuid_setup(void)
{
char *e;
struct sigaction ill_oact,ill_act;
sigset_t oset;
static int trigger=0;
if (trigger) return;
trigger=1;
sigfillset(&all_masked);
sigdelset(&all_masked,SIGILL);
sigdelset(&all_masked,SIGTRAP);
#ifdef SIGEMT
sigdelset(&all_masked,SIGEMT);
#endif
sigdelset(&all_masked,SIGFPE);
sigdelset(&all_masked,SIGBUS);
sigdelset(&all_masked,SIGSEGV);
if ((e=getenv("OPENSSL_ppccap")))
{
OPENSSL_ppccap_P=strtoul(e,NULL,0);
return;
}
OPENSSL_ppccap_P = 0;
memset(&ill_act,0,sizeof(ill_act));
ill_act.sa_handler = ill_handler;
ill_act.sa_mask = all_masked;
sigprocmask(SIG_SETMASK,&ill_act.sa_mask,&oset);
sigaction(SIGILL,&ill_act,&ill_oact);
if (sizeof(size_t)==4)
{
if (sigsetjmp(ill_jmp,1) == 0)
{
OPENSSL_ppc64_probe();
OPENSSL_ppccap_P |= PPC_FPU64;
}
}
else
{
/*
* Wanted code detecting POWER6 CPU and setting PPC_FPU64
*/
}
if (sigsetjmp(ill_jmp,1) == 0)
{
OPENSSL_altivec_probe();
OPENSSL_ppccap_P |= PPC_ALTIVEC;
}
sigaction (SIGILL,&ill_oact,NULL);
sigprocmask(SIG_SETMASK,&oset,NULL);
}

View File

@ -23,36 +23,67 @@ $code=<<___;
.machine "any"
.text
.globl .OPENSSL_cpuid_setup
.globl .OPENSSL_ppc64_probe
.align 4
.OPENSSL_cpuid_setup:
.OPENSSL_ppc64_probe:
fcfid f1,f1
extrdi r0,r0,32,0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_altivec_probe
.align 4
.OPENSSL_altivec_probe:
.long 0x10000484 # vor v0,v0,v0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_wipe_cpu
.align 4
.OPENSSL_wipe_cpu:
xor r0,r0,r0
fmr f0,f31
fmr f1,f31
fmr f2,f31
mr r3,r1
fmr f3,f31
xor r4,r4,r4
fmr f4,f31
xor r5,r5,r5
fmr f5,f31
xor r6,r6,r6
fmr f6,f31
xor r7,r7,r7
fmr f7,f31
xor r8,r8,r8
fmr f8,f31
xor r9,r9,r9
fmr f9,f31
xor r10,r10,r10
fmr f10,f31
xor r11,r11,r11
fmr f11,f31
xor r12,r12,r12
fmr f12,f31
fmr f13,f31
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_atomic_add
.align 4
.OPENSSL_atomic_add:
Loop: lwarx r5,0,r3
Ladd: lwarx r5,0,r3
add r0,r4,r5
stwcx. r0,0,r3
bne- Loop
bne- Ladd
$SIGNX r3,r0
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.globl .OPENSSL_rdtsc
.align 4
@ -60,6 +91,8 @@ Loop: lwarx r5,0,r3
mftb r3
mftbu r4
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.globl .OPENSSL_cleanse
.align 4
@ -89,6 +122,9 @@ Laligned:
andi. r4,r4,3
bne Little
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

View File

@ -24,12 +24,14 @@ $flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$UCMP ="cmpld";
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$UCMP ="cmplw";
$STU ="stwu";
$POP ="lwz";
@ -43,7 +45,8 @@ die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=24*$SIZE_T;
$FRAME=24*$SIZE_T+64;
$LOCALS=6*$SIZE_T;
$K ="r0";
$sp ="r1";
@ -162,9 +165,8 @@ $code=<<___;
.globl .sha1_block_data_order
.align 4
.sha1_block_data_order:
$STU $sp,-$FRAME($sp)
mflr r0
$STU $sp,`-($FRAME+64)`($sp)
$PUSH r0,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
@ -182,6 +184,7 @@ $code=<<___;
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
lwz $A,0($ctx)
lwz $B,4($ctx)
lwz $C,8($ctx)
@ -192,8 +195,53 @@ $code=<<___;
Laligned:
mtctr $num
bl Lsha1_block_private
b Ldone
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for 64-byte input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,4095 ; distance to closest page boundary
srwi. $t1,$t1,6 ; t1/=64
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
mtctr $t1
subfc $num,$t1,$num
bl Lsha1_block_private
Lcross_page:
li $t1,16
mtctr $t1
addi r20,$sp,$LOCALS ; spot within the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*18`($sp)
li $t1,1
addi $inp,$sp,$LOCALS
mtctr $t1
bl Lsha1_block_private
$POP $inp,`$FRAME-$SIZE_T*18`($sp)
addic. $num,$num,-1
bne- Lunaligned
Ldone:
$POP r0,`$FRAME-$SIZE_T*18`($sp)
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
@ -212,54 +260,11 @@ Ldone:
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,`$FRAME+64`
addi $sp,$sp,$FRAME
blr
___
# PowerPC specification allows an implementation to be ill-behaved
# upon unaligned access which crosses page boundary. "Better safe
# than sorry" principle makes me treat it specially. But I don't
# look for particular offending word, but rather for 64-byte input
# block which crosses the boundary. Once found that block is aligned
# and hashed separately...
$code.=<<___;
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,4095 ; distance to closest page boundary
srwi. $t1,$t1,6 ; t1/=64
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
mtctr $t1
subfc $num,$t1,$num
bl Lsha1_block_private
Lcross_page:
li $t1,16
mtctr $t1
addi r20,$sp,$FRAME ; spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*19`($sp)
li $t1,1
addi $inp,$sp,$FRAME
mtctr $t1
bl Lsha1_block_private
$POP $inp,`$FRAME-$SIZE_T*19`($sp)
addic. $num,$num,-1
bne- Lunaligned
b Ldone
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
___
# This is private block function, which uses tailored calling
@ -309,6 +314,8 @@ $code.=<<___;
addi $inp,$inp,`16*4`
bdnz- Lsha1_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
___
$code.=<<___;
.asciz "SHA1 block transform for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"

View File

@ -40,6 +40,7 @@ $output =shift;
if ($flavour =~ /64/) {
$SIZE_T=8;
$LRSAVE=2*$SIZE_T;
$STU="stdu";
$UCMP="cmpld";
$SHL="sldi";
@ -47,6 +48,7 @@ if ($flavour =~ /64/) {
$PUSH="std";
} elsif ($flavour =~ /32/) {
$SIZE_T=4;
$LRSAVE=$SIZE_T;
$STU="stwu";
$UCMP="cmplw";
$SHL="slwi";
@ -87,7 +89,8 @@ if ($output =~ /512/) {
$SHR="srwi";
}
$FRAME=32*$SIZE_T;
$FRAME=32*$SIZE_T+16*$SZ;
$LOCALS=6*$SIZE_T;
$sp ="r1";
$toc="r2";
@ -179,13 +182,12 @@ $code=<<___;
.globl $func
.align 6
$func:
$STU $sp,-$FRAME($sp)
mflr r0
$STU $sp,`-($FRAME+16*$SZ)`($sp)
$SHL $num,$num,`log(16*$SZ)/log(2)`
$PUSH $ctx,`$FRAME-$SIZE_T*22`($sp)
$PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
@ -206,6 +208,7 @@ $func:
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
$PUSH r0,`$FRAME+$LRSAVE`($sp)
$LD $A,`0*$SZ`($ctx)
mr $inp,r4 ; incarnate $inp
@ -217,7 +220,7 @@ $func:
$LD $G,`6*$SZ`($ctx)
$LD $H,`7*$SZ`($ctx)
b LPICmeup
bl LPICmeup
LPICedup:
andi. r0,$inp,3
bne Lunaligned
@ -226,8 +229,60 @@ Laligned:
$PUSH $num,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
b Ldone
; PowerPC specification allows an implementation to be ill-behaved
; upon unaligned access which crosses page boundary. "Better safe
; than sorry" principle makes me treat it specially. But I don't
; look for particular offending word, but rather for the input
; block which crosses the boundary. Once found that block is aligned
; and hashed separately...
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
subfc $num,$t1,$num
add $t1,$inp,$t1
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
; $inp equals to the intermediate end pointer here
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
addi r20,$sp,$LOCALS ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
addi $t1,$sp,`$LOCALS+16*$SZ` ; fictitious end pointer
addi $inp,$sp,$LOCALS ; fictitious inp pointer
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
$POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
addic. $num,$num,`-16*$SZ` ; num--
bne- Lunaligned
Ldone:
$POP r0,`$FRAME-$SIZE_T*21`($sp)
$POP r0,`$FRAME+$LRSAVE`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
@ -249,64 +304,12 @@ Ldone:
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,`$FRAME+16*$SZ`
addi $sp,$sp,$FRAME
blr
___
.long 0
.byte 0,12,4,1,0x80,18,3,0
.long 0
# PowerPC specification allows an implementation to be ill-behaved
# upon unaligned access which crosses page boundary. "Better safe
# than sorry" principle makes me treat it specially. But I don't
# look for particular offending word, but rather for the input
# block which crosses the boundary. Once found that block is aligned
# and hashed separately...
$code.=<<___;
.align 4
Lunaligned:
subfic $t1,$inp,4096
andi. $t1,$t1,`4096-16*$SZ` ; distance to closest page boundary
beq Lcross_page
$UCMP $num,$t1
ble- Laligned ; didn't cross the page boundary
subfc $num,$t1,$num
add $t1,$inp,$t1
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real remaining num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; intermediate end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
; $inp equals to the intermediate end pointer here
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real remaining num
Lcross_page:
li $t1,`16*$SZ/4`
mtctr $t1
addi r20,$sp,$FRAME ; aligned spot below the frame
Lmemcpy:
lbz r16,0($inp)
lbz r17,1($inp)
lbz r18,2($inp)
lbz r19,3($inp)
addi $inp,$inp,4
stb r16,0(r20)
stb r17,1(r20)
stb r18,2(r20)
stb r19,3(r20)
addi r20,r20,4
bdnz Lmemcpy
$PUSH $inp,`$FRAME-$SIZE_T*26`($sp) ; save real inp
addi $t1,$sp,`$FRAME+16*$SZ` ; fictitious end pointer
addi $inp,$sp,$FRAME ; fictitious inp pointer
$PUSH $num,`$FRAME-$SIZE_T*25`($sp) ; save real num
$PUSH $t1,`$FRAME-$SIZE_T*24`($sp) ; end pointer
$PUSH $inp,`$FRAME-$SIZE_T*23`($sp) ; inp pointer
bl Lsha2_block_private
$POP $inp,`$FRAME-$SIZE_T*26`($sp) ; restore real inp
$POP $num,`$FRAME-$SIZE_T*25`($sp) ; restore real num
addic. $num,$num,`-16*$SZ` ; num--
bne- Lunaligned
b Ldone
___
$code.=<<___;
.align 4
Lsha2_block_private:
___
@ -372,6 +375,8 @@ $code.=<<___;
$ST $H,`7*$SZ`($ctx)
bne Lsha2_block_private
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
___
# Ugly hack here, because PPC assembler syntax seem to vary too
@ -379,22 +384,15 @@ ___
$code.=<<___;
.align 6
LPICmeup:
bl LPIC
addi $Tbl,$Tbl,`64-4` ; "distance" between . and last nop
b LPICedup
nop
nop
nop
nop
nop
LPIC: mflr $Tbl
mflr r0
bcl 20,31,\$+4
mflr $Tbl ; vvvvvv "distance" between . and 1st data entry
addi $Tbl,$Tbl,`64-8`
mtlr r0
blr
nop
nop
nop
nop
nop
nop
.long 0
.byte 0,12,0x14,0,0,0,0,0
.space `64-9*4`
___
$code.=<<___ if ($SZ==8);
.long 0x428a2f98,0xd728ae22,0x71374491,0x23ef65cd