310 lines
6.8 KiB
Raku
Executable File
310 lines
6.8 KiB
Raku
Executable File
#!/usr/bin/env perl
|
|
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
|
|
# project. Rights for redistribution and usage in source and binary
|
|
# forms are granted according to the OpenSSL license.
|
|
# ====================================================================
|
|
|
|
# I let hardware handle unaligned input(*), except on page boundaries
|
|
# (see below for details). Otherwise straightforward implementation
|
|
# with X vector in register bank. The module is big-endian [which is
|
|
# not big deal as there're no little-endian targets left around].
|
|
#
|
|
# (*) this means that this module is inappropriate for PPC403? Does
|
|
# anybody know if pre-POWER3 can sustain unaligned load?
|
|
|
|
# -m64 -m32
|
|
# ----------------------------------
|
|
# PPC970,gcc-4.0.0 +76% +59%
|
|
|
|
$output = shift;
|
|
|
|
if ($output =~ /64\.s/) {
|
|
$SIZE_T =8;
|
|
$UCMP ="cmpld";
|
|
$STU ="stdu";
|
|
$POP ="ld";
|
|
$PUSH ="std";
|
|
} elsif ($output =~ /32\.s/) {
|
|
$SIZE_T =4;
|
|
$UCMP ="cmplw";
|
|
$STU ="stwu";
|
|
$POP ="lwz";
|
|
$PUSH ="stw";
|
|
} else { die "nonsense $output"; }
|
|
|
|
( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
|
|
die "can't call ../perlasm/ppc-xlate.pl: $!";
|
|
|
|
$FRAME=24*$SIZE_T;
|
|
|
|
$K ="r0";
|
|
$sp ="r1";
|
|
$toc="r2";
|
|
$ctx="r3";
|
|
$inp="r4";
|
|
$num="r5";
|
|
$t0 ="r15";
|
|
$t1 ="r6";
|
|
|
|
$A ="r7";
|
|
$B ="r8";
|
|
$C ="r9";
|
|
$D ="r10";
|
|
$E ="r11";
|
|
$T ="r12";
|
|
|
|
@V=($A,$B,$C,$D,$E,$T);
|
|
@X=("r16","r17","r18","r19","r20","r21","r22","r23",
|
|
"r24","r25","r26","r27","r28","r29","r30","r31");
|
|
|
|
sub BODY_00_19 {
|
|
my ($i,$a,$b,$c,$d,$e,$f)=@_;
|
|
my $j=$i+1;
|
|
$code.=<<___ if ($i==0);
|
|
lwz @X[$i],`$i*4`($inp)
|
|
___
|
|
$code.=<<___ if ($i<15);
|
|
lwz @X[$j],`$j*4`($inp)
|
|
add $f,$K,$e
|
|
rotlwi $e,$a,5
|
|
add $f,$f,@X[$i]
|
|
and $t0,$c,$b
|
|
add $f,$f,$e
|
|
andc $t1,$d,$b
|
|
rotlwi $b,$b,30
|
|
or $t0,$t0,$t1
|
|
add $f,$f,$t0
|
|
___
|
|
$code.=<<___ if ($i>=15);
|
|
add $f,$K,$e
|
|
rotlwi $e,$a,5
|
|
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
|
|
add $f,$f,@X[$i%16]
|
|
and $t0,$c,$b
|
|
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
|
|
add $f,$f,$e
|
|
andc $t1,$d,$b
|
|
rotlwi $b,$b,30
|
|
or $t0,$t0,$t1
|
|
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
|
|
add $f,$f,$t0
|
|
rotlwi @X[$j%16],@X[$j%16],1
|
|
___
|
|
}
|
|
|
|
sub BODY_20_39 {
|
|
my ($i,$a,$b,$c,$d,$e,$f)=@_;
|
|
my $j=$i+1;
|
|
$code.=<<___ if ($i<79);
|
|
add $f,$K,$e
|
|
rotlwi $e,$a,5
|
|
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
|
|
add $f,$f,@X[$i%16]
|
|
xor $t0,$b,$c
|
|
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
|
|
add $f,$f,$e
|
|
rotlwi $b,$b,30
|
|
xor $t0,$t0,$d
|
|
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
|
|
add $f,$f,$t0
|
|
rotlwi @X[$j%16],@X[$j%16],1
|
|
___
|
|
$code.=<<___ if ($i==79);
|
|
add $f,$K,$e
|
|
rotlwi $e,$a,5
|
|
lwz r16,0($ctx)
|
|
add $f,$f,@X[$i%16]
|
|
xor $t0,$b,$c
|
|
lwz r17,4($ctx)
|
|
add $f,$f,$e
|
|
rotlwi $b,$b,30
|
|
lwz r18,8($ctx)
|
|
xor $t0,$t0,$d
|
|
lwz r19,12($ctx)
|
|
add $f,$f,$t0
|
|
lwz r20,16($ctx)
|
|
___
|
|
}
|
|
|
|
sub BODY_40_59 {
|
|
my ($i,$a,$b,$c,$d,$e,$f)=@_;
|
|
my $j=$i+1;
|
|
$code.=<<___;
|
|
add $f,$K,$e
|
|
rotlwi $e,$a,5
|
|
xor @X[$j%16],@X[$j%16],@X[($j+2)%16]
|
|
add $f,$f,@X[$i%16]
|
|
and $t0,$b,$c
|
|
xor @X[$j%16],@X[$j%16],@X[($j+8)%16]
|
|
add $f,$f,$e
|
|
or $t1,$b,$c
|
|
rotlwi $b,$b,30
|
|
xor @X[$j%16],@X[$j%16],@X[($j+13)%16]
|
|
and $t1,$t1,$d
|
|
or $t0,$t0,$t1
|
|
rotlwi @X[$j%16],@X[$j%16],1
|
|
add $f,$f,$t0
|
|
___
|
|
}
|
|
|
|
$code=<<___;
|
|
.text
|
|
|
|
.globl .sha1_block_asm_data_order
|
|
.align 4
|
|
.sha1_block_asm_data_order:
|
|
mflr r0
|
|
$STU $sp,`-($FRAME+64)`($sp)
|
|
$PUSH r0,`$FRAME-$SIZE_T*18`($sp)
|
|
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
|
|
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
|
|
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
|
|
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
|
|
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
|
|
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
|
|
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
|
|
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
|
|
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
|
|
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
|
|
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
|
|
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
|
|
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
|
|
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
|
|
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
|
|
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
|
|
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
|
|
lwz $A,0($ctx)
|
|
lwz $B,4($ctx)
|
|
lwz $C,8($ctx)
|
|
lwz $D,12($ctx)
|
|
lwz $E,16($ctx)
|
|
andi. r0,$inp,3
|
|
bne Lunaligned
|
|
Laligned:
|
|
mtctr $num
|
|
bl Lsha1_block_private
|
|
Ldone:
|
|
$POP r0,`$FRAME-$SIZE_T*18`($sp)
|
|
$POP r15,`$FRAME-$SIZE_T*17`($sp)
|
|
$POP r16,`$FRAME-$SIZE_T*16`($sp)
|
|
$POP r17,`$FRAME-$SIZE_T*15`($sp)
|
|
$POP r18,`$FRAME-$SIZE_T*14`($sp)
|
|
$POP r19,`$FRAME-$SIZE_T*13`($sp)
|
|
$POP r20,`$FRAME-$SIZE_T*12`($sp)
|
|
$POP r21,`$FRAME-$SIZE_T*11`($sp)
|
|
$POP r22,`$FRAME-$SIZE_T*10`($sp)
|
|
$POP r23,`$FRAME-$SIZE_T*9`($sp)
|
|
$POP r24,`$FRAME-$SIZE_T*8`($sp)
|
|
$POP r25,`$FRAME-$SIZE_T*7`($sp)
|
|
$POP r26,`$FRAME-$SIZE_T*6`($sp)
|
|
$POP r27,`$FRAME-$SIZE_T*5`($sp)
|
|
$POP r28,`$FRAME-$SIZE_T*4`($sp)
|
|
$POP r29,`$FRAME-$SIZE_T*3`($sp)
|
|
$POP r30,`$FRAME-$SIZE_T*2`($sp)
|
|
$POP r31,`$FRAME-$SIZE_T*1`($sp)
|
|
mtlr r0
|
|
addi $sp,$sp,`$FRAME+64`
|
|
blr
|
|
___
|
|
|
|
# PowerPC specification allows an implementation to be ill-behaved
|
|
# upon unaligned access which crosses page boundary. "Better safe
|
|
# than sorry" principle makes me treat it specially. But I don't
|
|
# look for particular offending word, but rather for 64-byte input
|
|
# block which crosses the boundary. Once found that block is aligned
|
|
# and hashed separately...
|
|
$code.=<<___;
|
|
.align 4
|
|
Lunaligned:
|
|
subfic $t1,$inp,4096
|
|
andi. $t1,$t1,4095 ; distance to closest page boundary
|
|
srwi. $t1,$t1,6 ; t1/=64
|
|
beq Lcross_page
|
|
$UCMP $num,$t1
|
|
ble- Laligned ; didn't cross the page boundary
|
|
mtctr $t1
|
|
subfc $num,$t1,$num
|
|
bl Lsha1_block_private
|
|
Lcross_page:
|
|
li $t1,16
|
|
mtctr $t1
|
|
addi r20,$sp,$FRAME ; spot below the frame
|
|
Lmemcpy:
|
|
lbz r16,0($inp)
|
|
lbz r17,1($inp)
|
|
lbz r18,2($inp)
|
|
lbz r19,3($inp)
|
|
addi $inp,$inp,4
|
|
stb r16,0(r20)
|
|
stb r17,1(r20)
|
|
stb r18,2(r20)
|
|
stb r19,3(r20)
|
|
addi r20,r20,4
|
|
bdnz Lmemcpy
|
|
|
|
$PUSH $inp,`$FRAME-$SIZE_T*19`($sp)
|
|
li $t1,1
|
|
addi $inp,$sp,$FRAME
|
|
mtctr $t1
|
|
bl Lsha1_block_private
|
|
$POP $inp,`$FRAME-$SIZE_T*19`($sp)
|
|
addic. $num,$num,-1
|
|
bne- Lunaligned
|
|
b Ldone
|
|
___
|
|
|
|
# This is private block function, which uses tailored calling
|
|
# interface, namely upon entry SHA_CTX is pre-loaded to given
|
|
# registers and counter register contains amount of chunks to
|
|
# digest...
|
|
$code.=<<___;
|
|
.align 4
|
|
Lsha1_block_private:
|
|
___
|
|
$code.=<<___; # load K_00_19
|
|
lis $K,0x5a82
|
|
ori $K,$K,0x7999
|
|
___
|
|
for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
|
|
$code.=<<___; # load K_20_39
|
|
lis $K,0x6ed9
|
|
ori $K,$K,0xeba1
|
|
___
|
|
for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
|
$code.=<<___; # load K_40_59
|
|
lis $K,0x8f1b
|
|
ori $K,$K,0xbcdc
|
|
___
|
|
for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
|
|
$code.=<<___; # load K_60_79
|
|
lis $K,0xca62
|
|
ori $K,$K,0xc1d6
|
|
___
|
|
for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
|
|
$code.=<<___;
|
|
add r16,r16,$E
|
|
add r17,r17,$T
|
|
add r18,r18,$A
|
|
add r19,r19,$B
|
|
add r20,r20,$C
|
|
stw r16,0($ctx)
|
|
mr $A,r16
|
|
stw r17,4($ctx)
|
|
mr $B,r17
|
|
stw r18,8($ctx)
|
|
mr $C,r18
|
|
stw r19,12($ctx)
|
|
mr $D,r19
|
|
stw r20,16($ctx)
|
|
mr $E,r20
|
|
addi $inp,$inp,`16*4`
|
|
bdnz- Lsha1_block_private
|
|
blr
|
|
___
|
|
|
|
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
|
print $code;
|
|
close STDOUT;
|