openssl/crypto/aes/asm/aes-ppc.pl

1190 lines
34 KiB
Raku

#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Needs more work: key setup, page boundaries, CBC routine...
#
# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
# 128-bit key, which is ~40% better than 64-bit code generated by gcc
# 4.0. But these are not the ones currently used! Their "compact"
# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
# at 1/3 of ppc_AES_decrypt.
# February 2010
#
# Rescheduling instructions to favour Power6 pipeline gives 10%
# performance improvement on the platform in question (and marginal
# improvement even on others). It should be noted that Power6 fails
# to process a byte in 18 cycles, only in 23, because it fails to issue
# 4 load instructions in two cycles, only in 3. As a result non-compact
# block subroutines are 25% slower than one would expect. Compact
# functions scale better, because they have pure computational part,
# which scales perfectly with clock frequency. To be specific
# ppc_AES_encrypt_compact operates at 42 cycles per byte, while
# ppc_AES_decrypt_compact - at 55 (in 64-bit build).
# Command line: <flavour> [<output>].  The flavour string selects the word
# size; everything that depends on it (size of a saved register and the
# store-with-update / load / store mnemonics used by the prologues and
# epilogues in the template) is parameterised here.
$flavour = shift;
if ($flavour =~ /64/) {
    $SIZE_T = 8;
    $STU    = "stdu";
    $POP    = "ld";
    $PUSH   = "std";
} elsif ($flavour =~ /32/) {
    $SIZE_T = 4;
    $STU    = "stwu";
    $POP    = "lwz";
    $PUSH   = "stw";
} else {
    die "nonsense $flavour";
}

# Look for ppc-xlate.pl next to this script, then in ../../perlasm/.
# $1 is consulted only when the match succeeded, so a script name without a
# directory part yields an empty prefix instead of a stale capture.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator;
# the optional second argument is handed to it unchanged.  The failure check
# has to use low-precedence "or": with "||" the die was attached to the
# (always true) command string, so a failed open went unnoticed.
open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";

# Stack frame allocated by the exported entry points: 32 register-sized
# slots, of which the top 21 hold r0 (LR copy), the TOC register and r13-r31.
$FRAME=32*$SIZE_T;
# Append table words to the generated assembly held in $code.
#
# Each argument is emitted as one line "\t.long\t0xXXXXXXXX,0xXXXXXXXX",
# i.e. the same 32-bit value twice, so every table entry occupies 8 bytes.
# (The round loops index these tables with an 8-byte stride from base
# addresses offset by 0..3 bytes, which relies on this duplication.)
#
# Arguments: list of integers; processing stops at the first undefined
# value.  No meaningful return value; the only effect is on $code.
#
# The sub is deliberately declared without the former empty "()" prototype:
# it does take arguments, and the prototype was only harmless because the
# callers use the &-form, which ignores prototypes.
sub _data_word
{
    foreach my $word (@_) {
        last unless defined($word);
        $code.=sprintf"\t.long\t0x%08x,0x%08x\n",$word,$word;
    }
}
# Symbolic register names used throughout the assembly template.
($sp,$toc)       = ("r1","r2");
($inp,$out,$key) = ("r3","r4","r5");

# Table base pointers; note that $Tbl0 shares r3 with $inp.
($Tbl0,$Tbl1,$Tbl2,$Tbl3) = ("r3","r6","r7","r2");

# Cipher state words, round-key temporaries and 16 scratch accumulators
# occupy the consecutive registers r8-r11, r12-r15 and r16-r31.
($s0,$s1,$s2,$s3) = map("r$_", 8..11);
($t0,$t1,$t2,$t3) = map("r$_", 12..15);
($acc00,$acc01,$acc02,$acc03,$acc04,$acc05,$acc06,$acc07,
 $acc08,$acc09,$acc10,$acc11,$acc12,$acc13,$acc14,$acc15) = map("r$_", 16..31);

# stay away from TLS pointer
# 64-bit builds move $t1 off r13; 32-bit builds move $Tbl3 off r2 (it takes
# over $t0's register, and $t0 falls back to r0).  The die guards catch an
# edit of the map above that would invalidate these swaps.
if ($SIZE_T==8) {
    die if ($t1 ne "r13");
    $t1="r0";
} else {
    die if ($Tbl3 ne "r2");
    $Tbl3=$t0;
    $t0="r0";
}

# The compact routines do not use $Tbl2/$Tbl3 as table pointers, so those
# registers double as holders of the 0x80.. and 0x1b.. byte masks.
($mask80,$mask1b) = ($Tbl2,$Tbl3);
# Start of the generated assembly: section directives followed by two
# position-independent "locator" stubs that return a table address in $Tbl0.
#
# Each stub saves LR in r0, executes "bcl 20,31,$+4" (a branch-and-link to
# the very next instruction) to obtain its own address in LR, copies that
# into $Tbl0, adds a constant distance, restores LR from r0 and returns.
# Both stubs therefore clobber r0 and $Tbl0.
#
# Layout the constants depend on (every instruction is 4 bytes):
#   - LAES_Te starts on a 128-byte boundary; 6 instructions plus
#     ".space 32-24" make it 32 bytes long.
#   - LAES_Td follows; 6 instructions plus ".space 128-32-24" fill the
#     remainder, so the two stubs occupy exactly 128 bytes.
#   - The address captured by bcl is 8 bytes past the stub's label, hence
#     128-8 for LAES_Te reaches the data emitted right after this chunk.
#   - LAES_Td additionally skips the first table set: 2048 bytes written by
#     the first _data_word call (256 entries of 8 bytes) and the 256 .byte
#     values that follow it.
#
# The `...` expressions are not shell commands; they are evaluated as Perl
# arithmetic by the substitution applied to $code at the end of the script.
$code.=<<___;
.machine "any"
.text
.align 7
LAES_Te:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvv "distance" between . and 1st data entry
addi $Tbl0,$Tbl0,`128-8`
mtlr r0
blr
.space `32-24`
LAES_Td:
mflr r0
bcl 20,31,\$+4
mflr $Tbl0 ; vvvvvvvv "distance" between . and 1st data entry
addi $Tbl0,$Tbl0,`128-8-32+2048+256`
mtlr r0
blr
.space `128-32-24`
___
&_data_word(
0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
$code.=<<___;
.byte 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
.byte 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
.byte 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
.byte 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
.byte 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
.byte 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
.byte 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
.byte 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
.byte 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
.byte 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
.byte 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
.byte 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
.byte 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
.byte 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
.byte 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
.byte 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
.byte 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
.byte 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
.byte 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
.byte 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
.byte 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
.byte 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
.byte 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
.byte 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
.byte 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
.byte 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
.byte 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
.byte 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
.byte 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
.byte 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
.byte 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
.byte 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
___
&_data_word(
0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
$code.=<<___;
.byte 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
.byte 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
.byte 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
.byte 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
.byte 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
.byte 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
.byte 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
.byte 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
.byte 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
.byte 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
.byte 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
.byte 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
.byte 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
.byte 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
.byte 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
.byte 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
.byte 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
.byte 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
.byte 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
.byte 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
.byte 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
.byte 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
.byte 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
.byte 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
.byte 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
.byte 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
.byte 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
.byte 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
.byte 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
.byte 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
.byte 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
.byte 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
.globl .AES_encrypt
.align 7
.AES_encrypt:
mflr r0
$STU $sp,-$FRAME($sp)
$PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
lwz $s3,12($inp)
bl LAES_Te
bl Lppc_AES_encrypt_compact
stw $s0,0($out)
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
$POP r0,`$FRAME-$SIZE_T*21`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.align 5
Lppc_AES_encrypt:
lwz $acc00,240($key)
lwz $t0,0($key)
lwz $t1,4($key)
lwz $t2,8($key)
lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
addi $Tbl2,$Tbl0,2
addi $Tbl3,$Tbl0,1
addi $acc00,$acc00,-1
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
mtctr $acc00
.align 4
Lenc_loop:
rlwinm $acc00,$s0,`32-24+3`,21,28
rlwinm $acc01,$s1,`32-24+3`,21,28
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
lwz $t1,4($key)
rlwinm $acc04,$s1,`32-16+3`,21,28
rlwinm $acc05,$s2,`32-16+3`,21,28
lwz $t2,8($key)
lwz $t3,12($key)
rlwinm $acc06,$s3,`32-16+3`,21,28
rlwinm $acc07,$s0,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0+3`,21,28
rlwinm $acc13,$s0,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0+3`,21,28
rlwinm $acc15,$s2,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
xor $t2,$t2,$acc10
xor $t3,$t3,$acc11
xor $s0,$t0,$acc12
xor $s1,$t1,$acc13
xor $s2,$t2,$acc14
xor $s3,$t3,$acc15
addi $key,$key,16
bdnz- Lenc_loop
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s1,`32-16`,24,31
rlwinm $acc05,$s2,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
lwz $acc12,`2048+128`($Tbl0)
lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
rlwinm $acc09,$s3,`32-8`,24,31
lwz $acc14,`2048+192`($Tbl0)
lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc10,$s0,`32-8`,24,31
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc00,$Tbl2,$acc00
lbzx $acc01,$Tbl2,$acc01
rlwinm $acc12,$s3,`0`,24,31
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc02,$Tbl2,$acc02
lbzx $acc03,$Tbl2,$acc03
rlwinm $acc14,$s1,`0`,24,31
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc04,$Tbl2,$acc04
lbzx $acc05,$Tbl2,$acc05
rlwinm $s0,$acc00,24,0,7
rlwinm $s1,$acc01,24,0,7
lbzx $acc06,$Tbl2,$acc06
lbzx $acc07,$Tbl2,$acc07
rlwinm $s2,$acc02,24,0,7
rlwinm $s3,$acc03,24,0,7
lbzx $acc08,$Tbl2,$acc08
lbzx $acc09,$Tbl2,$acc09
rlwimi $s0,$acc04,16,8,15
rlwimi $s1,$acc05,16,8,15
lbzx $acc10,$Tbl2,$acc10
lbzx $acc11,$Tbl2,$acc11
rlwimi $s2,$acc06,16,8,15
rlwimi $s3,$acc07,16,8,15
lbzx $acc12,$Tbl2,$acc12
lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
lbzx $acc14,$Tbl2,$acc14
lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
or $s0,$s0,$acc12
or $s1,$s1,$acc13
or $s2,$s2,$acc14
or $s3,$s3,$acc15
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.align 4
Lppc_AES_encrypt_compact:
lwz $acc00,240($key)
lwz $t0,0($key)
lwz $t1,4($key)
lwz $t2,8($key)
lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
lis $mask80,0x8080
lis $mask1b,0x1b1b
addi $key,$key,16
ori $mask80,$mask80,0x8080
ori $mask1b,$mask1b,0x1b1b
mtctr $acc00
.align 4
Lenc_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
rlwinm $acc01,$s1,`32-24`,24,31
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s1,`32-16`,24,31
rlwinm $acc05,$s2,`32-16`,24,31
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s3,`0`,24,31
rlwinm $acc13,$s0,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0`,24,31
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
lwz $t1,4($key)
or $s0,$s0,$acc12
or $s1,$s1,$acc13
lwz $t2,8($key)
lwz $t3,12($key)
or $s2,$s2,$acc14
or $s3,$s3,$acc15
addi $key,$key,16
bdz Lenc_compact_done
and $acc00,$s0,$mask80 # r1=r0&0x80808080
and $acc01,$s1,$mask80
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
srwi $acc05,$acc01,7
srwi $acc06,$acc02,7
srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
andc $acc09,$s1,$mask80
andc $acc10,$s2,$mask80
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
sub $acc02,$acc02,$acc06
sub $acc03,$acc03,$acc07
add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
add $acc09,$acc09,$acc09
add $acc10,$acc10,$acc10
add $acc11,$acc11,$acc11
and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
and $acc01,$acc01,$mask1b
and $acc02,$acc02,$mask1b
and $acc03,$acc03,$mask1b
xor $acc00,$acc00,$acc08 # r2
xor $acc01,$acc01,$acc09
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
rotlwi $acc12,$s0,16 # ROTATE(r0,16)
rotlwi $acc13,$s1,16
rotlwi $acc14,$s2,16
rotlwi $acc15,$s3,16
xor $s0,$s0,$acc00 # r0^r2
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotrwi $s0,$s0,24 # ROTATE(r2^r0,24)
rotrwi $s1,$s1,24
rotrwi $s2,$s2,24
rotrwi $s3,$s3,24
xor $s0,$s0,$acc00 # ROTATE(r2^r0,24)^r2
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotlwi $acc08,$acc12,8 # ROTATE(r0,24)
rotlwi $acc09,$acc13,8
rotlwi $acc10,$acc14,8
rotlwi $acc11,$acc15,8
xor $s0,$s0,$acc12 #
xor $s1,$s1,$acc13
xor $s2,$s2,$acc14
xor $s3,$s3,$acc15
xor $s0,$s0,$acc08 #
xor $s1,$s1,$acc09
xor $s2,$s2,$acc10
xor $s3,$s3,$acc11
b Lenc_compact_loop
.align 4
Lenc_compact_done:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.globl .AES_decrypt
.align 7
.AES_decrypt:
mflr r0
$STU $sp,-$FRAME($sp)
$PUSH r0,`$FRAME-$SIZE_T*21`($sp)
$PUSH $toc,`$FRAME-$SIZE_T*20`($sp)
$PUSH r13,`$FRAME-$SIZE_T*19`($sp)
$PUSH r14,`$FRAME-$SIZE_T*18`($sp)
$PUSH r15,`$FRAME-$SIZE_T*17`($sp)
$PUSH r16,`$FRAME-$SIZE_T*16`($sp)
$PUSH r17,`$FRAME-$SIZE_T*15`($sp)
$PUSH r18,`$FRAME-$SIZE_T*14`($sp)
$PUSH r19,`$FRAME-$SIZE_T*13`($sp)
$PUSH r20,`$FRAME-$SIZE_T*12`($sp)
$PUSH r21,`$FRAME-$SIZE_T*11`($sp)
$PUSH r22,`$FRAME-$SIZE_T*10`($sp)
$PUSH r23,`$FRAME-$SIZE_T*9`($sp)
$PUSH r24,`$FRAME-$SIZE_T*8`($sp)
$PUSH r25,`$FRAME-$SIZE_T*7`($sp)
$PUSH r26,`$FRAME-$SIZE_T*6`($sp)
$PUSH r27,`$FRAME-$SIZE_T*5`($sp)
$PUSH r28,`$FRAME-$SIZE_T*4`($sp)
$PUSH r29,`$FRAME-$SIZE_T*3`($sp)
$PUSH r30,`$FRAME-$SIZE_T*2`($sp)
$PUSH r31,`$FRAME-$SIZE_T*1`($sp)
lwz $s0,0($inp)
lwz $s1,4($inp)
lwz $s2,8($inp)
lwz $s3,12($inp)
bl LAES_Td
bl Lppc_AES_decrypt_compact
stw $s0,0($out)
stw $s1,4($out)
stw $s2,8($out)
stw $s3,12($out)
$POP r0,`$FRAME-$SIZE_T*21`($sp)
$POP $toc,`$FRAME-$SIZE_T*20`($sp)
$POP r13,`$FRAME-$SIZE_T*19`($sp)
$POP r14,`$FRAME-$SIZE_T*18`($sp)
$POP r15,`$FRAME-$SIZE_T*17`($sp)
$POP r16,`$FRAME-$SIZE_T*16`($sp)
$POP r17,`$FRAME-$SIZE_T*15`($sp)
$POP r18,`$FRAME-$SIZE_T*14`($sp)
$POP r19,`$FRAME-$SIZE_T*13`($sp)
$POP r20,`$FRAME-$SIZE_T*12`($sp)
$POP r21,`$FRAME-$SIZE_T*11`($sp)
$POP r22,`$FRAME-$SIZE_T*10`($sp)
$POP r23,`$FRAME-$SIZE_T*9`($sp)
$POP r24,`$FRAME-$SIZE_T*8`($sp)
$POP r25,`$FRAME-$SIZE_T*7`($sp)
$POP r26,`$FRAME-$SIZE_T*6`($sp)
$POP r27,`$FRAME-$SIZE_T*5`($sp)
$POP r28,`$FRAME-$SIZE_T*4`($sp)
$POP r29,`$FRAME-$SIZE_T*3`($sp)
$POP r30,`$FRAME-$SIZE_T*2`($sp)
$POP r31,`$FRAME-$SIZE_T*1`($sp)
mtlr r0
addi $sp,$sp,$FRAME
blr
.align 5
Lppc_AES_decrypt:
lwz $acc00,240($key)
lwz $t0,0($key)
lwz $t1,4($key)
lwz $t2,8($key)
lwz $t3,12($key)
addi $Tbl1,$Tbl0,3
addi $Tbl2,$Tbl0,2
addi $Tbl3,$Tbl0,1
addi $acc00,$acc00,-1
addi $key,$key,16
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
mtctr $acc00
.align 4
Ldec_loop:
rlwinm $acc00,$s0,`32-24+3`,21,28
rlwinm $acc01,$s1,`32-24+3`,21,28
rlwinm $acc02,$s2,`32-24+3`,21,28
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
lwz $t1,4($key)
rlwinm $acc04,$s3,`32-16+3`,21,28
rlwinm $acc05,$s0,`32-16+3`,21,28
lwz $t2,8($key)
lwz $t3,12($key)
rlwinm $acc06,$s1,`32-16+3`,21,28
rlwinm $acc07,$s2,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
lwzx $acc01,$Tbl0,$acc01
rlwinm $acc08,$s2,`32-8+3`,21,28
rlwinm $acc09,$s3,`32-8+3`,21,28
lwzx $acc02,$Tbl0,$acc02
lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
lwzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0+3`,21,28
rlwinm $acc13,$s2,`0+3`,21,28
lwzx $acc06,$Tbl1,$acc06
lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0+3`,21,28
rlwinm $acc15,$s0,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
lwzx $acc09,$Tbl2,$acc09
xor $t0,$t0,$acc00
xor $t1,$t1,$acc01
lwzx $acc10,$Tbl2,$acc10
lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
lwzx $acc13,$Tbl3,$acc13
xor $t0,$t0,$acc04
xor $t1,$t1,$acc05
lwzx $acc14,$Tbl3,$acc14
lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
xor $t1,$t1,$acc09
xor $t2,$t2,$acc10
xor $t3,$t3,$acc11
xor $s0,$t0,$acc12
xor $s1,$t1,$acc13
xor $s2,$t2,$acc14
xor $s3,$t3,$acc15
addi $key,$key,16
bdnz- Ldec_loop
addi $Tbl2,$Tbl0,2048
nop
lwz $t0,0($key)
lwz $t1,4($key)
rlwinm $acc00,$s0,`32-24`,24,31
rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
lwz $t3,12($key)
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s3,`32-16`,24,31
rlwinm $acc05,$s0,`32-16`,24,31
lwz $acc10,`2048+64`($Tbl0)
lwz $acc11,`2048+96`($Tbl0)
lbzx $acc00,$Tbl2,$acc00
lbzx $acc01,$Tbl2,$acc01
lwz $acc12,`2048+128`($Tbl0)
lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc06,$s1,`32-16`,24,31
rlwinm $acc07,$s2,`32-16`,24,31
lwz $acc14,`2048+192`($Tbl0)
lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl2,$acc02
lbzx $acc03,$Tbl2,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl2,$acc04
lbzx $acc05,$Tbl2,$acc05
rlwinm $acc12,$s1,`0`,24,31
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl2,$acc06
lbzx $acc07,$Tbl2,$acc07
rlwinm $acc14,$s3,`0`,24,31
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl2,$acc08
lbzx $acc09,$Tbl2,$acc09
rlwinm $s0,$acc00,24,0,7
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl2,$acc10
lbzx $acc11,$Tbl2,$acc11
rlwinm $s2,$acc02,24,0,7
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl2,$acc12
lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc04,16,8,15
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl2,$acc14
lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc06,16,8,15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
or $s0,$s0,$acc12
or $s1,$s1,$acc13
or $s2,$s2,$acc14
or $s3,$s3,$acc15
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.align 4
Lppc_AES_decrypt_compact:
lwz $acc00,240($key)
lwz $t0,0($key)
lwz $t1,4($key)
lwz $t2,8($key)
lwz $t3,12($key)
addi $Tbl1,$Tbl0,2048
lis $mask80,0x8080
lis $mask1b,0x1b1b
addi $key,$key,16
ori $mask80,$mask80,0x8080
ori $mask1b,$mask1b,0x1b1b
___
# 64-bit builds only: copy the low 32 bits of each mask register into its
# upper 32 bits.  The 64-bit InvMixColumns code of the compact decrypt
# routine packs two state words into one register and applies $mask80 and
# $mask1b to all eight bytes at once, so both halves must carry the pattern.
$code.=<<___ if ($SIZE_T==8);
insrdi $mask80,$mask80,32,0
insrdi $mask1b,$mask1b,32,0
___
# Round loop of Lppc_AES_decrypt_compact (the routine's label and setup are
# emitted by the preceding template chunk).
#
# CTR is loaded from $acc00, which the setup code read from offset 240 of
# the key structure (presumably the round count -- confirm against the key
# schedule code).  Each iteration:
#   1. XORs the current round key ($t0-$t3) into the state ($s0-$s3);
#   2. isolates every state byte with rlwinm and replaces it with a byte
#      loaded via lbzx from the 256-byte table at $Tbl1; the source words
#      are rotated per byte position (e.g. the second byte of the new $s0
#      is taken from $s3), which performs the row shift at the same time;
#   3. reassembles the four words with rlwinm/rlwimi/or;
#   4. loads the next 16 bytes of round key and advances $key.
# bdz decrements CTR and leaves for Ldec_compact_done when it reaches zero;
# otherwise execution falls through into the column-mixing code emitted by
# the chunks that follow.
$code.=<<___;
mtctr $acc00
.align 4
Ldec_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
rlwinm $acc00,$s0,`32-24`,24,31
rlwinm $acc01,$s1,`32-24`,24,31
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
rlwinm $acc04,$s3,`32-16`,24,31
rlwinm $acc05,$s0,`32-16`,24,31
rlwinm $acc06,$s1,`32-16`,24,31
rlwinm $acc07,$s2,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
lbzx $acc01,$Tbl1,$acc01
rlwinm $acc08,$s2,`32-8`,24,31
rlwinm $acc09,$s3,`32-8`,24,31
lbzx $acc02,$Tbl1,$acc02
lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
lbzx $acc05,$Tbl1,$acc05
rlwinm $acc12,$s1,`0`,24,31
rlwinm $acc13,$s2,`0`,24,31
lbzx $acc06,$Tbl1,$acc06
lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0`,24,31
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
lbzx $acc09,$Tbl1,$acc09
rlwinm $s0,$acc00,24,0,7
rlwinm $s1,$acc01,24,0,7
lbzx $acc10,$Tbl1,$acc10
lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
lbzx $acc13,$Tbl1,$acc13
rlwimi $s0,$acc04,16,8,15
rlwimi $s1,$acc05,16,8,15
lbzx $acc14,$Tbl1,$acc14
lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
lwz $t0,0($key)
lwz $t1,4($key)
or $s0,$s0,$acc12
or $s1,$s1,$acc13
lwz $t2,8($key)
lwz $t3,12($key)
or $s2,$s2,$acc14
or $s3,$s3,$acc15
addi $key,$key,16
bdz Ldec_compact_done
___
# 64-bit builds only: first half of the column-mixing step of the compact
# decrypt round, processing two state words per instruction.
#
# insrdi places $s1 in the upper half of $s0's register and $s3 in the upper
# half of $s2's, so every and/srdi/sub/add/xor below acts on eight state
# bytes.  Following the inline notation (r0 = packed state), the code derives
#   r2 = ((r0 & 0x7f..) << 1) ^ (((r0 & 0x80..) - ((r0 & 0x80..) >> 7)) & 0x1b..)
# i.e. a byte-wise doubling with conditional XOR of 0x1b, then r4 from r2 and
# r8 from r4 in the same way, and finally r2^r0 and r4^r0.
#
# On exit the even-numbered registers $acc00..$acc10 hold the results for the
# lower-half words and extrdi has copied the upper-half results into the
# odd-numbered $acc01..$acc11, which is the register layout the width-
# independent tail emitted afterwards expects.  Relies on $mask80/$mask1b
# carrying their pattern in both register halves.
$code.=<<___ if ($SIZE_T==8);
# vectorized permutation improves decrypt performance by 10%
insrdi $s0,$s1,32,0
insrdi $s2,$s3,32,0
and $acc00,$s0,$mask80 # r1=r0&0x80808080
and $acc02,$s2,$mask80
srdi $acc04,$acc00,7 # r1>>7
srdi $acc06,$acc02,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
andc $acc10,$s2,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc02,$acc02,$acc06
add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
add $acc10,$acc10,$acc10
and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
and $acc02,$acc02,$mask1b
xor $acc00,$acc00,$acc08 # r2
xor $acc02,$acc02,$acc10
and $acc04,$acc00,$mask80 # r1=r2&0x80808080
and $acc06,$acc02,$mask80
srdi $acc08,$acc04,7 # r1>>7
srdi $acc10,$acc06,7
andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
andc $acc14,$acc02,$mask80
sub $acc04,$acc04,$acc08 # r1-(r1>>7)
sub $acc06,$acc06,$acc10
add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
add $acc14,$acc14,$acc14
and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
and $acc06,$acc06,$mask1b
xor $acc04,$acc04,$acc12 # r4
xor $acc06,$acc06,$acc14
and $acc08,$acc04,$mask80 # r1=r4&0x80808080
and $acc10,$acc06,$mask80
srdi $acc12,$acc08,7 # r1>>7
srdi $acc14,$acc10,7
sub $acc08,$acc08,$acc12 # r1-(r1>>7)
sub $acc10,$acc10,$acc14
andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
andc $acc14,$acc06,$mask80
add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
add $acc14,$acc14,$acc14
and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
and $acc10,$acc10,$mask1b
xor $acc08,$acc08,$acc12 # r8
xor $acc10,$acc10,$acc14
xor $acc00,$acc00,$s0 # r2^r0
xor $acc02,$acc02,$s2
xor $acc04,$acc04,$s0 # r4^r0
xor $acc06,$acc06,$s2
extrdi $acc01,$acc00,32,0
extrdi $acc03,$acc02,32,0
extrdi $acc05,$acc04,32,0
extrdi $acc07,$acc06,32,0
extrdi $acc09,$acc08,32,0
extrdi $acc11,$acc10,32,0
___
# 32-bit builds only: first half of the column-mixing step of the compact
# decrypt round, one state word per register.
#
# For each of $s0..$s3 (r0 in the inline notation) the code derives
#   r2 = ((r0 & 0x7f7f7f7f) << 1) ^ (((r0 & 0x80808080) - ((r0 & 0x80808080) >> 7)) & 0x1b1b1b1b)
# i.e. a byte-wise doubling with conditional XOR of 0x1b, then r4 from r2 and
# r8 from r4 in the same way.  The four words are interleaved instruction by
# instruction, so each group of four lines is the same operation on
# $s0,$s1,$s2,$s3 respectively.
#
# On exit: $acc00-$acc03 = r2^r0, $acc04-$acc07 = r4^r0, $acc08-$acc11 = r8;
# $acc12-$acc15 are scratch.  This is the register layout consumed by the
# width-independent tail emitted afterwards.
$code.=<<___ if ($SIZE_T==4);
and $acc00,$s0,$mask80 # r1=r0&0x80808080
and $acc01,$s1,$mask80
and $acc02,$s2,$mask80
and $acc03,$s3,$mask80
srwi $acc04,$acc00,7 # r1>>7
srwi $acc05,$acc01,7
srwi $acc06,$acc02,7
srwi $acc07,$acc03,7
andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
andc $acc09,$s1,$mask80
andc $acc10,$s2,$mask80
andc $acc11,$s3,$mask80
sub $acc00,$acc00,$acc04 # r1-(r1>>7)
sub $acc01,$acc01,$acc05
sub $acc02,$acc02,$acc06
sub $acc03,$acc03,$acc07
add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
add $acc09,$acc09,$acc09
add $acc10,$acc10,$acc10
add $acc11,$acc11,$acc11
and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
and $acc01,$acc01,$mask1b
and $acc02,$acc02,$mask1b
and $acc03,$acc03,$mask1b
xor $acc00,$acc00,$acc08 # r2
xor $acc01,$acc01,$acc09
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
and $acc04,$acc00,$mask80 # r1=r2&0x80808080
and $acc05,$acc01,$mask80
and $acc06,$acc02,$mask80
and $acc07,$acc03,$mask80
srwi $acc08,$acc04,7 # r1>>7
srwi $acc09,$acc05,7
srwi $acc10,$acc06,7
srwi $acc11,$acc07,7
andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
andc $acc13,$acc01,$mask80
andc $acc14,$acc02,$mask80
andc $acc15,$acc03,$mask80
sub $acc04,$acc04,$acc08 # r1-(r1>>7)
sub $acc05,$acc05,$acc09
sub $acc06,$acc06,$acc10
sub $acc07,$acc07,$acc11
add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
add $acc13,$acc13,$acc13
add $acc14,$acc14,$acc14
add $acc15,$acc15,$acc15
and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
and $acc05,$acc05,$mask1b
and $acc06,$acc06,$mask1b
and $acc07,$acc07,$mask1b
xor $acc04,$acc04,$acc12 # r4
xor $acc05,$acc05,$acc13
xor $acc06,$acc06,$acc14
xor $acc07,$acc07,$acc15
and $acc08,$acc04,$mask80 # r1=r4&0x80808080
and $acc09,$acc05,$mask80
and $acc10,$acc06,$mask80
and $acc11,$acc07,$mask80
srwi $acc12,$acc08,7 # r1>>7
srwi $acc13,$acc09,7
srwi $acc14,$acc10,7
srwi $acc15,$acc11,7
sub $acc08,$acc08,$acc12 # r1-(r1>>7)
sub $acc09,$acc09,$acc13
sub $acc10,$acc10,$acc14
sub $acc11,$acc11,$acc15
andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
andc $acc13,$acc05,$mask80
andc $acc14,$acc06,$mask80
andc $acc15,$acc07,$mask80
add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
add $acc13,$acc13,$acc13
add $acc14,$acc14,$acc14
add $acc15,$acc15,$acc15
and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
and $acc09,$acc09,$mask1b
and $acc10,$acc10,$mask1b
and $acc11,$acc11,$mask1b
xor $acc08,$acc08,$acc12 # r8
xor $acc09,$acc09,$acc13
xor $acc10,$acc10,$acc14
xor $acc11,$acc11,$acc15
xor $acc00,$acc00,$s0 # r2^r0
xor $acc01,$acc01,$s1
xor $acc02,$acc02,$s2
xor $acc03,$acc03,$s3
xor $acc04,$acc04,$s0 # r4^r0
xor $acc05,$acc05,$s1
xor $acc06,$acc06,$s2
xor $acc07,$acc07,$s3
___
# Width-independent tail of the column-mixing step, followed by the exit
# path of Lppc_AES_decrypt_compact.
#
# Inputs (left by whichever width-specific chunk was emitted before this
# one): $s0-$s3 = r0, $acc00-$acc03 = r2^r0, $acc04-$acc07 = r4^r0,
# $acc08-$acc11 = r8.  Using only 32-bit rotates and XORs the code
# accumulates into each state word, as annotated inline:
#   ROTATE(r0,8) ^ (r2^r0) ^ (r4^r0) ^ r8
#     ^ ROTATE(r8^r2^r0,24) ^ ROTATE(r8^r4^r0,16) ^ ROTATE(r8,8)
# and then branches back to Ldec_compact_loop for the next round.
#
# Ldec_compact_done is reached from the loop's bdz once the counter runs
# out: it XORs the last loaded round key into the state and returns.
# The chunk ends with a zero word, an identification string and padding to
# a 128-byte boundary.
$code.=<<___;
rotrwi $s0,$s0,8 # = ROTATE(r0,8)
rotrwi $s1,$s1,8
rotrwi $s2,$s2,8
rotrwi $s3,$s3,8
xor $s0,$s0,$acc00 # ^= r2^r0
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
xor $acc00,$acc00,$acc08
xor $acc01,$acc01,$acc09
xor $acc02,$acc02,$acc10
xor $acc03,$acc03,$acc11
xor $s0,$s0,$acc04 # ^= r4^r0
xor $s1,$s1,$acc05
xor $s2,$s2,$acc06
xor $s3,$s3,$acc07
rotrwi $acc00,$acc00,24
rotrwi $acc01,$acc01,24
rotrwi $acc02,$acc02,24
rotrwi $acc03,$acc03,24
xor $acc04,$acc04,$acc08
xor $acc05,$acc05,$acc09
xor $acc06,$acc06,$acc10
xor $acc07,$acc07,$acc11
xor $s0,$s0,$acc08 # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
xor $s1,$s1,$acc09
xor $s2,$s2,$acc10
xor $s3,$s3,$acc11
rotrwi $acc04,$acc04,16
rotrwi $acc05,$acc05,16
rotrwi $acc06,$acc06,16
rotrwi $acc07,$acc07,16
xor $s0,$s0,$acc00 # ^= ROTATE(r8^r2^r0,24)
xor $s1,$s1,$acc01
xor $s2,$s2,$acc02
xor $s3,$s3,$acc03
rotrwi $acc08,$acc08,8
rotrwi $acc09,$acc09,8
rotrwi $acc10,$acc10,8
rotrwi $acc11,$acc11,8
xor $s0,$s0,$acc04 # ^= ROTATE(r8^r4^r0,16)
xor $s1,$s1,$acc05
xor $s2,$s2,$acc06
xor $s3,$s3,$acc07
xor $s0,$s0,$acc08 # ^= ROTATE(r8,8)
xor $s1,$s1,$acc09
xor $s2,$s2,$acc10
xor $s3,$s3,$acc11
b Ldec_compact_loop
.align 4
Ldec_compact_done:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
blr
.long 0
.asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
.align 7
___
# Replace every `...` span in the accumulated template with the value of
# the Perl expression it contains (frame offsets, rotate counts, table
# distances), then hand the finished assembly to the translator.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;

# STDOUT was reopened as a pipe to ppc-xlate.pl.  Buffered write errors and
# a non-zero exit status of the translator only become visible when the pipe
# is closed, so the result must be checked or a truncated/empty output file
# would be reported as success.
close STDOUT or die "error closing STDOUT: $! (child status $?)";