Add DES SPARC T4 module from master.

This commit is contained in:
Andy Polyakov 2013-05-19 23:51:22 +02:00
parent 14ef63c15e
commit 9c1ee1bed5
4 changed files with 614 additions and 10 deletions

View File

@ -130,7 +130,7 @@ my $x86_elf_asm="$x86_asm:elf";
my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o vpaes-x86_64.o bsaes-x86_64.o aesni-x86_64.o aesni-sha1-x86_64.o aesni-sha256-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o rc4-md5-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o aesni-gcm-x86_64.o:";
my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o::void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o::void";
my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::::void";
my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o::void";
my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o::::::::";

18
TABLE
View File

@ -175,7 +175,7 @@ $lflags =
$bn_ops = BN_LLONG RC2_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC2 BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =
@ -2716,7 +2716,7 @@ $lflags = -lsocket -lnsl -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =
@ -2749,7 +2749,7 @@ $lflags = -lsocket -lnsl -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =
@ -4432,7 +4432,7 @@ $lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =
@ -4663,7 +4663,7 @@ $lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =
@ -5521,7 +5521,7 @@ $lflags = -lsocket -lnsl -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =
@ -5554,7 +5554,7 @@ $lflags = -lsocket -lnsl -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =
@ -5653,7 +5653,7 @@ $lflags = -lsocket -lnsl -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =
@ -5686,7 +5686,7 @@ $lflags = -lsocket -lnsl -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR
$cpuid_obj = sparcv9cap.o sparccpuid.o
$bn_obj = bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o
$des_obj = des_enc-sparc.o fcrypt_b.o
$des_obj = des_enc-sparc.o fcrypt_b.o dest4-sparcv9.o
$aes_obj = aes_core.o aes_cbc.o aes-sparcv9.o
$bf_obj =
$md5_obj =

View File

@ -61,6 +61,8 @@ des: des.o cbc3_enc.o lib
des_enc-sparc.S: asm/des_enc.m4
m4 -B 8192 asm/des_enc.m4 > des_enc-sparc.S
dest4-sparcv9.s: asm/dest4-sparcv9.pl
$(PERL) asm/dest4-sparcv9.pl $(CFLAGS) > $@
des-586.s: asm/des-586.pl ../perlasm/x86asm.pl ../perlasm/cbc.pl
$(PERL) asm/des-586.pl $(PERLASM_SCHEME) $(CFLAGS) > $@

View File

@ -0,0 +1,602 @@
#!/usr/bin/env perl
# ====================================================================
# Written by David S. Miller <davem@davemloft.net> and Andy Polyakov
# <appro@openssl.org>. The module is licensed under 2-clause BSD
# license. March 2013. All rights reserved.
# ====================================================================
######################################################################
# DES for SPARC T4.
#
# As with other hardware-assisted ciphers CBC encrypt results [for
# aligned data] are virtually identical to critical path lengths:
#
# DES Triple-DES
# CBC encrypt 4.14/4.15(*) 11.7/11.7
# CBC decrypt 1.77/4.11(**) 6.42/7.47
#
# (*) numbers after slash are for
# misaligned data;
# (**) this is result for largest
# block size, unlike all other
# cases smaller blocks results
# are better[?];
# Work out the directory this script lives in (anything up to and
# including the last '/' or '\' of $0) so that the shared perlasm
# helper can be found both next to the script and in ../../perlasm.
$0 =~ m{(.*[/\\])[^/\\]+$};
$dir = $1;
push(@INC, "${dir}", "${dir}../../perlasm");
require "sparcv9_modes.pl";

# Command-line arguments (the Makefile passes $(CFLAGS)) are handed to
# asm_init; presumably this is what establishes $::abibits -- confirm
# against sparcv9_modes.pl.
&asm_init(@ARGV);

# For a 64-bit ABI, declare %g2/%g3 as scratch registers up front.
if ($::abibits == 64) {
$code .= <<___;
.register %g2,#scratch
.register %g3,#scratch
___
}

# Everything that follows is emitted into the text section.
$code .= <<___;
.text
___
# des_t4_key_expand: emit the key-schedule expansion routine.
#
# Register use in the generated code:
#   $inp (%o0) - address of the 8-byte DES key, any alignment;
#   $out (%o1) - address of the expanded schedule; the routine stores
#                sixteen double-words at offsets 0x00..0x78 (128 bytes)
#                with std.
#
# A misaligned key is handled by rounding $inp down with alignaddr,
# loading the two enclosing double-words and merging them with
# faligndata; the aligned case branches straight to label 1 with the
# first ldd executed in the branch delay slot. The schedule itself is
# produced by a chain of des_kexpand instructions, each feeding on an
# earlier result, and the stores are interleaved with the expansion
# steps (so they do not appear in ascending offset order). The routine
# is a leaf: it returns with retl and performs the last store in the
# delay slot.
{ my ($inp,$out)=("%o0","%o1");
$code.=<<___;
.align 32
.globl des_t4_key_expand
.type des_t4_key_expand,#function
des_t4_key_expand:
andcc $inp, 0x7, %g0
alignaddr $inp, %g0, $inp
bz,pt %icc, 1f
ldd [$inp + 0x00], %f0
ldd [$inp + 0x08], %f2
faligndata %f0, %f2, %f0
1: des_kexpand %f0, 0, %f0
des_kexpand %f0, 1, %f2
std %f0, [$out + 0x00]
des_kexpand %f2, 3, %f6
std %f2, [$out + 0x08]
des_kexpand %f2, 2, %f4
des_kexpand %f6, 3, %f10
std %f6, [$out + 0x18]
des_kexpand %f6, 2, %f8
std %f4, [$out + 0x10]
des_kexpand %f10, 3, %f14
std %f10, [$out + 0x28]
des_kexpand %f10, 2, %f12
std %f8, [$out + 0x20]
des_kexpand %f14, 1, %f16
std %f14, [$out + 0x38]
des_kexpand %f16, 3, %f20
std %f12, [$out + 0x30]
des_kexpand %f16, 2, %f18
std %f16, [$out + 0x40]
des_kexpand %f20, 3, %f24
std %f20, [$out + 0x50]
des_kexpand %f20, 2, %f22
std %f18, [$out + 0x48]
des_kexpand %f24, 3, %f28
std %f24, [$out + 0x60]
des_kexpand %f24, 2, %f26
std %f22, [$out + 0x58]
des_kexpand %f28, 1, %f30
std %f28, [$out + 0x70]
std %f26, [$out + 0x68]
retl
std %f30, [$out + 0x78]
.size des_t4_key_expand,.-des_t4_key_expand
___
}
{ my ($inp,$out,$len,$key,$ivec) = map("%o$_",(0..4));
my ($ileft,$iright,$omask) = map("%g$_",(1..3));
# Single-DES CBC routines.
#
# Register use in the generated code (see the "my" lists that open this
# block): $inp/$out/$len/$key/$ivec are %o0..%o4, and $ileft/$iright/
# $omask are %g1..%g3.
#
# Common prologue of both routines:
#   * $inp is rounded down to an 8-byte boundary; $ileft is the
#     misalignment in bits and $iright its negation, so a misaligned
#     block is assembled from two aligned ldx loads with sllx/srlx/or.
#   * $out is rounded down with alignaddrl; $omask is 0xff shifted
#     right by the byte misalignment, or 0 when $out is aligned
#     (movrz). A non-zero $omask selects the slow output path at
#     label 2, which realigns the result with faligndata and writes it
#     with two masked partial stores, inverting $omask in between.
#   * $len is converted from bytes to 8-byte blocks (srlx by 3).
#   * The sixteen double-words of the key schedule are kept in
#     %f4..%f34 for the whole call.
#
# NOTE(review): the loops are bottom-tested ($len is decremented and
# checked only after a block has been processed), so callers are
# assumed to pass a non-zero length -- confirm against the C glue.

# des_t4_cbc_encrypt: %f0 carries the running ivec/ciphertext block.
# In the unaligned-output path the realigned copy goes to %f2 so that
# %f0 stays intact as the chaining value; the partial stores therefore
# have to write %f2. (They must not write %f8, which holds the key
# schedule double-word loaded from [$key + 0x10].)
$code.=<<___;
.globl des_t4_cbc_encrypt
.align 32
des_t4_cbc_encrypt:
ld [$ivec + 0], %f0 ! load ivec
ld [$ivec + 4], %f1
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 0xff, $omask
prefetch [$inp], 20
prefetch [$inp + 63], 20
sub %g0, $ileft, $iright
and $out, 7, %g4
alignaddrl $out, %g0, $out
srl $omask, %g4, $omask
srlx $len, 3, $len
movrz %g4, 0, $omask
prefetch [$out], 22
ldd [$key + 0x00], %f4 ! load key schedule
ldd [$key + 0x08], %f6
ldd [$key + 0x10], %f8
ldd [$key + 0x18], %f10
ldd [$key + 0x20], %f12
ldd [$key + 0x28], %f14
ldd [$key + 0x30], %f16
ldd [$key + 0x38], %f18
ldd [$key + 0x40], %f20
ldd [$key + 0x48], %f22
ldd [$key + 0x50], %f24
ldd [$key + 0x58], %f26
ldd [$key + 0x60], %f28
ldd [$key + 0x68], %f30
ldd [$key + 0x70], %f32
ldd [$key + 0x78], %f34
.Ldes_cbc_enc_loop:
ldx [$inp + 0], %g4
brz,pt $ileft, 4f
nop
ldx [$inp + 8], %g5
sllx %g4, $ileft, %g4
srlx %g5, $iright, %g5
or %g5, %g4, %g4
4:
movxtod %g4, %f2
prefetch [$inp + 8+63], 20
add $inp, 8, $inp
fxor %f2, %f0, %f0 ! ^= ivec
prefetch [$out + 63], 22
des_ip %f0, %f0
des_round %f4, %f6, %f0, %f0
des_round %f8, %f10, %f0, %f0
des_round %f12, %f14, %f0, %f0
des_round %f16, %f18, %f0, %f0
des_round %f20, %f22, %f0, %f0
des_round %f24, %f26, %f0, %f0
des_round %f28, %f30, %f0, %f0
des_round %f32, %f34, %f0, %f0
des_iip %f0, %f0
brnz,pn $omask, 2f
sub $len, 1, $len
std %f0, [$out + 0]
brnz,pt $len, .Ldes_cbc_enc_loop
add $out, 8, $out
st %f0, [$ivec + 0] ! write out ivec
retl
st %f1, [$ivec + 4]
.align 16
2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
! and ~4x deterioration
! in inp==out case
faligndata %f0, %f0, %f2 ! handle unaligned output
stda %f2, [$out + $omask]0xc0 ! partial store
add $out, 8, $out
orn %g0, $omask, $omask
stda %f2, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .Ldes_cbc_enc_loop+4
orn %g0, $omask, $omask
st %f0, [$ivec + 0] ! write out ivec
retl
st %f1, [$ivec + 4]
.type des_t4_cbc_encrypt,#function
.size des_t4_cbc_encrypt,.-des_t4_cbc_encrypt
___

# des_t4_cbc_decrypt: the key schedule is loaded in reverse order
# (0x78 down to 0x00). %f2 carries the previous ciphertext block (the
# ivec on entry); after each block it is refreshed from %g4, the
# ciphertext just read, and it is what gets written back to $ivec on
# exit. Because the chaining value lives in %f2, the unaligned-output
# path can realign %f0 in place before the partial stores.
$code.=<<___;
.globl des_t4_cbc_decrypt
.align 32
des_t4_cbc_decrypt:
ld [$ivec + 0], %f2 ! load ivec
ld [$ivec + 4], %f3
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 0xff, $omask
prefetch [$inp], 20
prefetch [$inp + 63], 20
sub %g0, $ileft, $iright
and $out, 7, %g4
alignaddrl $out, %g0, $out
srl $omask, %g4, $omask
srlx $len, 3, $len
movrz %g4, 0, $omask
prefetch [$out], 22
ldd [$key + 0x78], %f4 ! load key schedule
ldd [$key + 0x70], %f6
ldd [$key + 0x68], %f8
ldd [$key + 0x60], %f10
ldd [$key + 0x58], %f12
ldd [$key + 0x50], %f14
ldd [$key + 0x48], %f16
ldd [$key + 0x40], %f18
ldd [$key + 0x38], %f20
ldd [$key + 0x30], %f22
ldd [$key + 0x28], %f24
ldd [$key + 0x20], %f26
ldd [$key + 0x18], %f28
ldd [$key + 0x10], %f30
ldd [$key + 0x08], %f32
ldd [$key + 0x00], %f34
.Ldes_cbc_dec_loop:
ldx [$inp + 0], %g4
brz,pt $ileft, 4f
nop
ldx [$inp + 8], %g5
sllx %g4, $ileft, %g4
srlx %g5, $iright, %g5
or %g5, %g4, %g4
4:
movxtod %g4, %f0
prefetch [$inp + 8+63], 20
add $inp, 8, $inp
prefetch [$out + 63], 22
des_ip %f0, %f0
des_round %f4, %f6, %f0, %f0
des_round %f8, %f10, %f0, %f0
des_round %f12, %f14, %f0, %f0
des_round %f16, %f18, %f0, %f0
des_round %f20, %f22, %f0, %f0
des_round %f24, %f26, %f0, %f0
des_round %f28, %f30, %f0, %f0
des_round %f32, %f34, %f0, %f0
des_iip %f0, %f0
fxor %f2, %f0, %f0 ! ^= ivec
movxtod %g4, %f2
brnz,pn $omask, 2f
sub $len, 1, $len
std %f0, [$out + 0]
brnz,pt $len, .Ldes_cbc_dec_loop
add $out, 8, $out
st %f2, [$ivec + 0] ! write out ivec
retl
st %f3, [$ivec + 4]
.align 16
2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
! and ~4x deterioration
! in inp==out case
faligndata %f0, %f0, %f0 ! handle unaligned output
stda %f0, [$out + $omask]0xc0 ! partial store
add $out, 8, $out
orn %g0, $omask, $omask
stda %f0, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .Ldes_cbc_dec_loop+4
orn %g0, $omask, $omask
st %f2, [$ivec + 0] ! write out ivec
retl
st %f3, [$ivec + 4]
.type des_t4_cbc_decrypt,#function
.size des_t4_cbc_decrypt,.-des_t4_cbc_decrypt
___
# Triple-DES (EDE3) CBC routines.
#
# The prologue, the misaligned-input handling and the unaligned-output
# path (label 2: faligndata followed by two masked partial stores) have
# the same structure as in the single-DES routines. What differs is the
# key handling: the routines read three consecutive 0x80-byte schedules
# at $key+0x000, $key+0x080 and $key+0x100.
#
#   encrypt: pass 1 uses the schedule at 0x000..0x078 in ascending
#            order (held in %f4..%f34 for the whole call), pass 2 uses
#            0x0f8 down to 0x080, pass 3 uses 0x100..0x178 ascending.
#   decrypt: pass 1 uses 0x178 down to 0x100 (held in %f4..%f34),
#            pass 2 uses 0x080..0x0f8 ascending, pass 3 uses 0x078
#            down to 0x000.
#
# Only the first pass' schedule is loaded once before the loop; the
# other two are re-loaded into %f36..%f62 on every iteration, with the
# ldd instructions interleaved between the des_round instructions of
# the preceding pass.
#
# One might wonder why does one have back-to-back des_iip/des_ip
# pairs between EDE passes. Indeed, aren't they inverse of each other?
# They almost are. Outcome of the pair is 32-bit words being swapped
# in target register. Consider pair of des_iip/des_ip as a way to
# perform the due swap, it's actually fastest way in this case.
$code.=<<___;
.globl des_t4_ede3_cbc_encrypt
.align 32
des_t4_ede3_cbc_encrypt:
ld [$ivec + 0], %f0 ! load ivec
ld [$ivec + 4], %f1
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 0xff, $omask
prefetch [$inp], 20
prefetch [$inp + 63], 20
sub %g0, $ileft, $iright
and $out, 7, %g4
alignaddrl $out, %g0, $out
srl $omask, %g4, $omask
srlx $len, 3, $len
movrz %g4, 0, $omask
prefetch [$out], 22
ldd [$key + 0x00], %f4 ! load key schedule
ldd [$key + 0x08], %f6
ldd [$key + 0x10], %f8
ldd [$key + 0x18], %f10
ldd [$key + 0x20], %f12
ldd [$key + 0x28], %f14
ldd [$key + 0x30], %f16
ldd [$key + 0x38], %f18
ldd [$key + 0x40], %f20
ldd [$key + 0x48], %f22
ldd [$key + 0x50], %f24
ldd [$key + 0x58], %f26
ldd [$key + 0x60], %f28
ldd [$key + 0x68], %f30
ldd [$key + 0x70], %f32
ldd [$key + 0x78], %f34
.Ldes_ede3_cbc_enc_loop:
ldx [$inp + 0], %g4
brz,pt $ileft, 4f
nop
ldx [$inp + 8], %g5
sllx %g4, $ileft, %g4
srlx %g5, $iright, %g5
or %g5, %g4, %g4
4:
movxtod %g4, %f2
prefetch [$inp + 8+63], 20
add $inp, 8, $inp
fxor %f2, %f0, %f0 ! ^= ivec
prefetch [$out + 63], 22
des_ip %f0, %f0
des_round %f4, %f6, %f0, %f0
des_round %f8, %f10, %f0, %f0
des_round %f12, %f14, %f0, %f0
des_round %f16, %f18, %f0, %f0
ldd [$key + 0x100-0x08], %f36
ldd [$key + 0x100-0x10], %f38
des_round %f20, %f22, %f0, %f0
ldd [$key + 0x100-0x18], %f40
ldd [$key + 0x100-0x20], %f42
des_round %f24, %f26, %f0, %f0
ldd [$key + 0x100-0x28], %f44
ldd [$key + 0x100-0x30], %f46
des_round %f28, %f30, %f0, %f0
ldd [$key + 0x100-0x38], %f48
ldd [$key + 0x100-0x40], %f50
des_round %f32, %f34, %f0, %f0
ldd [$key + 0x100-0x48], %f52
ldd [$key + 0x100-0x50], %f54
des_iip %f0, %f0
ldd [$key + 0x100-0x58], %f56
ldd [$key + 0x100-0x60], %f58
des_ip %f0, %f0
ldd [$key + 0x100-0x68], %f60
ldd [$key + 0x100-0x70], %f62
des_round %f36, %f38, %f0, %f0
ldd [$key + 0x100-0x78], %f36
ldd [$key + 0x100-0x80], %f38
des_round %f40, %f42, %f0, %f0
des_round %f44, %f46, %f0, %f0
des_round %f48, %f50, %f0, %f0
ldd [$key + 0x100+0x00], %f40
ldd [$key + 0x100+0x08], %f42
des_round %f52, %f54, %f0, %f0
ldd [$key + 0x100+0x10], %f44
ldd [$key + 0x100+0x18], %f46
des_round %f56, %f58, %f0, %f0
ldd [$key + 0x100+0x20], %f48
ldd [$key + 0x100+0x28], %f50
des_round %f60, %f62, %f0, %f0
ldd [$key + 0x100+0x30], %f52
ldd [$key + 0x100+0x38], %f54
des_round %f36, %f38, %f0, %f0
ldd [$key + 0x100+0x40], %f56
ldd [$key + 0x100+0x48], %f58
des_iip %f0, %f0
ldd [$key + 0x100+0x50], %f60
ldd [$key + 0x100+0x58], %f62
des_ip %f0, %f0
ldd [$key + 0x100+0x60], %f36
ldd [$key + 0x100+0x68], %f38
des_round %f40, %f42, %f0, %f0
ldd [$key + 0x100+0x70], %f40
ldd [$key + 0x100+0x78], %f42
des_round %f44, %f46, %f0, %f0
des_round %f48, %f50, %f0, %f0
des_round %f52, %f54, %f0, %f0
des_round %f56, %f58, %f0, %f0
des_round %f60, %f62, %f0, %f0
des_round %f36, %f38, %f0, %f0
des_round %f40, %f42, %f0, %f0
des_iip %f0, %f0
brnz,pn $omask, 2f
sub $len, 1, $len
std %f0, [$out + 0]
brnz,pt $len, .Ldes_ede3_cbc_enc_loop
add $out, 8, $out
st %f0, [$ivec + 0] ! write out ivec
retl
st %f1, [$ivec + 4]
.align 16
2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
! and ~2x deterioration
! in inp==out case
faligndata %f0, %f0, %f2 ! handle unaligned output
stda %f2, [$out + $omask]0xc0 ! partial store
add $out, 8, $out
orn %g0, $omask, $omask
stda %f2, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .Ldes_ede3_cbc_enc_loop+4
orn %g0, $omask, $omask
st %f0, [$ivec + 0] ! write out ivec
retl
st %f1, [$ivec + 4]
.type des_t4_ede3_cbc_encrypt,#function
.size des_t4_ede3_cbc_encrypt,.-des_t4_ede3_cbc_encrypt
.globl des_t4_ede3_cbc_decrypt
.align 32
des_t4_ede3_cbc_decrypt:
ld [$ivec + 0], %f2 ! load ivec
ld [$ivec + 4], %f3
and $inp, 7, $ileft
andn $inp, 7, $inp
sll $ileft, 3, $ileft
mov 0xff, $omask
prefetch [$inp], 20
prefetch [$inp + 63], 20
sub %g0, $ileft, $iright
and $out, 7, %g4
alignaddrl $out, %g0, $out
srl $omask, %g4, $omask
srlx $len, 3, $len
movrz %g4, 0, $omask
prefetch [$out], 22
ldd [$key + 0x100+0x78], %f4 ! load key schedule
ldd [$key + 0x100+0x70], %f6
ldd [$key + 0x100+0x68], %f8
ldd [$key + 0x100+0x60], %f10
ldd [$key + 0x100+0x58], %f12
ldd [$key + 0x100+0x50], %f14
ldd [$key + 0x100+0x48], %f16
ldd [$key + 0x100+0x40], %f18
ldd [$key + 0x100+0x38], %f20
ldd [$key + 0x100+0x30], %f22
ldd [$key + 0x100+0x28], %f24
ldd [$key + 0x100+0x20], %f26
ldd [$key + 0x100+0x18], %f28
ldd [$key + 0x100+0x10], %f30
ldd [$key + 0x100+0x08], %f32
ldd [$key + 0x100+0x00], %f34
.Ldes_ede3_cbc_dec_loop:
ldx [$inp + 0], %g4
brz,pt $ileft, 4f
nop
ldx [$inp + 8], %g5
sllx %g4, $ileft, %g4
srlx %g5, $iright, %g5
or %g5, %g4, %g4
4:
movxtod %g4, %f0
prefetch [$inp + 8+63], 20
add $inp, 8, $inp
prefetch [$out + 63], 22
des_ip %f0, %f0
des_round %f4, %f6, %f0, %f0
des_round %f8, %f10, %f0, %f0
des_round %f12, %f14, %f0, %f0
des_round %f16, %f18, %f0, %f0
ldd [$key + 0x80+0x00], %f36
ldd [$key + 0x80+0x08], %f38
des_round %f20, %f22, %f0, %f0
ldd [$key + 0x80+0x10], %f40
ldd [$key + 0x80+0x18], %f42
des_round %f24, %f26, %f0, %f0
ldd [$key + 0x80+0x20], %f44
ldd [$key + 0x80+0x28], %f46
des_round %f28, %f30, %f0, %f0
ldd [$key + 0x80+0x30], %f48
ldd [$key + 0x80+0x38], %f50
des_round %f32, %f34, %f0, %f0
ldd [$key + 0x80+0x40], %f52
ldd [$key + 0x80+0x48], %f54
des_iip %f0, %f0
ldd [$key + 0x80+0x50], %f56
ldd [$key + 0x80+0x58], %f58
des_ip %f0, %f0
ldd [$key + 0x80+0x60], %f60
ldd [$key + 0x80+0x68], %f62
des_round %f36, %f38, %f0, %f0
ldd [$key + 0x80+0x70], %f36
ldd [$key + 0x80+0x78], %f38
des_round %f40, %f42, %f0, %f0
des_round %f44, %f46, %f0, %f0
des_round %f48, %f50, %f0, %f0
ldd [$key + 0x80-0x08], %f40
ldd [$key + 0x80-0x10], %f42
des_round %f52, %f54, %f0, %f0
ldd [$key + 0x80-0x18], %f44
ldd [$key + 0x80-0x20], %f46
des_round %f56, %f58, %f0, %f0
ldd [$key + 0x80-0x28], %f48
ldd [$key + 0x80-0x30], %f50
des_round %f60, %f62, %f0, %f0
ldd [$key + 0x80-0x38], %f52
ldd [$key + 0x80-0x40], %f54
des_round %f36, %f38, %f0, %f0
ldd [$key + 0x80-0x48], %f56
ldd [$key + 0x80-0x50], %f58
des_iip %f0, %f0
ldd [$key + 0x80-0x58], %f60
ldd [$key + 0x80-0x60], %f62
des_ip %f0, %f0
ldd [$key + 0x80-0x68], %f36
ldd [$key + 0x80-0x70], %f38
des_round %f40, %f42, %f0, %f0
ldd [$key + 0x80-0x78], %f40
ldd [$key + 0x80-0x80], %f42
des_round %f44, %f46, %f0, %f0
des_round %f48, %f50, %f0, %f0
des_round %f52, %f54, %f0, %f0
des_round %f56, %f58, %f0, %f0
des_round %f60, %f62, %f0, %f0
des_round %f36, %f38, %f0, %f0
des_round %f40, %f42, %f0, %f0
des_iip %f0, %f0
fxor %f2, %f0, %f0 ! ^= ivec
movxtod %g4, %f2
brnz,pn $omask, 2f
sub $len, 1, $len
std %f0, [$out + 0]
brnz,pt $len, .Ldes_ede3_cbc_dec_loop
add $out, 8, $out
st %f2, [$ivec + 0] ! write out ivec
retl
st %f3, [$ivec + 4]
.align 16
2: ldxa [$inp]0x82, %g4 ! avoid read-after-write hazard
! and ~3x deterioration
! in inp==out case
faligndata %f0, %f0, %f0 ! handle unaligned output
stda %f0, [$out + $omask]0xc0 ! partial store
add $out, 8, $out
orn %g0, $omask, $omask
stda %f0, [$out + $omask]0xc0 ! partial store
brnz,pt $len, .Ldes_ede3_cbc_dec_loop+4
orn %g0, $omask, $omask
st %f2, [$ivec + 0] ! write out ivec
retl
st %f3, [$ivec + 4]
.type des_t4_ede3_cbc_decrypt,#function
.size des_t4_ede3_cbc_decrypt,.-des_t4_ede3_cbc_decrypt
___
}
# Append the module identification string and restore 4-byte alignment
# after it.
$code.=<<___;
.asciz "DES for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___

# Hand the accumulated \$code over for output; emit_assembler is not
# defined in this file (presumably it comes from sparcv9_modes.pl).
&emit_assembler();

# The Makefile rule redirects STDOUT into the generated .s file, so a
# failed flush (full disk, I/O error) only surfaces when the handle is
# closed. Fail loudly instead of leaving a silently truncated file for
# the assembler to pick up.
close STDOUT or die "error closing STDOUT: $!";