aes/asm/vpaes-ppc.pl: add little-endian support.

Andy Polyakov 2014-01-07 16:46:25 +01:00
parent f0170ebb97
commit 1fb83a3bc2
3 changed files with 161 additions and 131 deletions

Configure

@@ -365,7 +365,7 @@ my %table=(
 ####
 "linux-generic64","gcc:-DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ppc64", "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${ppc64_asm}:linux64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
-"linux-ppc64le","gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:".eval{my $asm=$ppc64_asm;$asm=~s/vpaes\-ppc\.o//;$asm}.":linux64le:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
+"linux-ppc64le","gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:$ppc64_asm:linux64le:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::",
 "linux-ia64", "gcc:-DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall::-D_REENTRANT::-ldl -no_cpprt:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-x86_64", "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",

TABLE

@@ -4532,7 +4532,7 @@ $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL
 $cpuid_obj = ppccpuid.o ppccap.o
 $bn_obj = bn-ppc.o ppc-mont.o ppc64-mont.o
 $des_obj =
-$aes_obj = aes_core.o aes_cbc.o aes-ppc.o
+$aes_obj = aes_core.o aes_cbc.o aes-ppc.o vpaes-ppc.o
 $bf_obj =
 $md5_obj =
 $sha1_obj = sha1-ppc.o sha256-ppc.o sha512-ppc.o

crypto/aes/asm/vpaes-ppc.pl

@@ -61,89 +61,89 @@ $code.=<<___;
 .align 7 # totally strategic alignment
 _vpaes_consts:
 Lk_mc_forward: # mc_forward
-.long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c
-.long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300
-.long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704
-.long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08
+.long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
+.long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
+.long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
+.long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
 Lk_mc_backward: # mc_backward
-.long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e
-.long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a
-.long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506
-.long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102
+.long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
+.long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
+.long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
+.long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
 Lk_sr: # sr
-.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
-.long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b
-.long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07
-.long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603
+.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
+.long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
+.long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
+.long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
 ##
 ## "Hot" constants
 ##
 Lk_inv: # inv, inva
-.long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704
-.long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03
+.long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
+.long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
 Lk_ipt: # input transform (lo, hi)
-.long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca
-.long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd
+.long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
+.long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
 Lk_sbo: # sbou, sbot
-.long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15
-.long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e
+.long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
+.long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
 Lk_sb1: # sb1u, sb1t
-.long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b
-.long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5
+.long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
+.long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
 Lk_sb2: # sb2u, sb2t
-.long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2
-.long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e
+.long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
+.long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
 ##
 ## Decryption stuff
 ##
 Lk_dipt: # decryption input transform
-.long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15
-.long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712
+.long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
+.long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
 Lk_dsbo: # decryption sbox final output
-.long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7
-.long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca
+.long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
+.long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
 Lk_dsb9: # decryption sbox output *9*u, *9*t
-.long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca
-.long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72
+.long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
+.long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
 Lk_dsbd: # decryption sbox output *D*u, *D*t
-.long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5
-.long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129
+.long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
+.long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
 Lk_dsbb: # decryption sbox output *B*u, *B*t
-.long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660
-.long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3
+.long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
+.long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
 Lk_dsbe: # decryption sbox output *E*u, *E*t
-.long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222
-.long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794
+.long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
+.long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
 ##
 ## Key schedule constants
 ##
 Lk_dksd: # decryption key schedule: invskew x*D
-.long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007
-.long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f
+.long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
+.long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
 Lk_dksb: # decryption key schedule: invskew x*B
-.long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603
-.long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9
+.long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
+.long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
 Lk_dkse: # decryption key schedule: invskew x*E + 0x63
-.long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553
-.long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd
+.long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
+.long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
 Lk_dks9: # decryption key schedule: invskew x*9
-.long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a
-.long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b
+.long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
+.long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
 Lk_rcon: # rcon
-.long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70
+.long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
 Lk_s63:
-.long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b
+.long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
 Lk_opt: # output transform
-.long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7
-.long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1
+.long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
+.long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
 Lk_deskew: # deskew tables: inverts the sbox's "skew"
-.long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d
-.long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128
+.long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
+.long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
 .align 5
 Lconsts:
 	mflr r0
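(Reader's note, not part of the commit: the ?inv/?rev/?asis tags are consumed by the post-processing loop added at the end of the script. Every tagged .long row is re-emitted as sixteen .byte values, most-significant byte of each word first. On big-endian targets that is all that happens; for a little-endian flavour, ?rev rows have all sixteen bytes reversed, while ?inv rows, which hold vperm lane indices, have each byte XORed with 0x0f so index i becomes 15-i; ?asis rows are left untouched. For example, the first Lk_sr row above is emitted as

    .byte 0x00,0x01,0x02,...,0x0f   # big-endian
    .byte 0x0f,0x0e,0x0d,...,0x00   # little-endian, ?inv: each byte ^ 0x0f
)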
@@ -227,7 +227,7 @@ _vpaes_encrypt_core:
 li r11, 0x10
 lvx v6, r9, $key
 addi r9, r9, 16
-vperm v5, v5, v6, $keyperm # align round key
+?vperm v5, v5, v6, $keyperm # align round key
 addi r10, r11, 0x40
 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
@@ -275,7 +275,7 @@ Lenc_entry:
 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
 addi r9, r9, 16
 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
-vperm v5, v5, v6, $keyperm # align round key
+?vperm v5, v5, v6, $keyperm # align round key
 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
 bdnz Lenc_loop
@@ -330,25 +330,20 @@ Lenc_entry:
 bl _vpaes_encrypt_preheat
-neg r8, $inp # prepare for unaligned access
-lvsl $keyperm, 0, $key
-lvsr $outperm, 0, $out
-lvsr $inpperm, 0, r8 # -$inp
-vnor $outmask, v7, v7 # 0xff..ff
-lvx $inptail, 0, $inp
-vperm $outmask, v7, $outmask, $outperm
+?lvsl $inpperm, 0, $inp # prepare for unaligned access
+lvx v0, 0, $inp
 addi $inp, $inp, 15 # 15 is not a typo
-lvx $outhead, 0, $out
-########
-vmr v0, $inptail
+?lvsr $outperm, 0, $out
+?lvsl $keyperm, 0, $key # prepare for unaligned access
+vnor $outmask, v7, v7 # 0xff..ff
 lvx $inptail, 0, $inp # redundant in aligned case
-addi $inp, $inp, 16
-vperm v0, v0, $inptail, $inpperm
+?vperm $outmask, v7, $outmask, $outperm
+lvx $outhead, 0, $out
+?vperm v0, v0, $inptail, $inpperm
 bl _vpaes_encrypt_core
-vperm v0, v0, v0, $outperm # rotate left
+vperm v0, v0, v0, $outperm # rotate right/left
 vsel v1, $outhead, v0, $outmask
 vmr $outhead, v0
 stvx v1, 0, $out
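(Reader's note, not part of the commit: this four-instruction tail is the usual AltiVec misaligned-store idiom, and it is why the comment now reads "rotate right/left" — the rotation direction depends on whether the ?-prefixed lvsr/lvsl above was kept or swapped for the target endianness. Annotated, assuming $out is misaligned by 3 bytes:

    vperm v0, v0, v0, $outperm       # rotate result so byte 0 lands at offset 3
    vsel v1, $outhead, v0, $outmask  # bytes 0-2 from previous store (or original memory), rest new
    vmr $outhead, v0                 # keep rotated block; its tail seeds the next store
    stvx v1, 0, $out                 # stvx ignores the low 4 address bits, so the store is aligned
)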
@@ -445,7 +440,7 @@ _vpaes_decrypt_core:
 li r11, 0x30
 lvx v6, r9, $key
 addi r9, r9, 16
-vperm v5, v5, v6, $keyperm # align round key
+?vperm v5, v5, v6, $keyperm # align round key
 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
@@ -509,7 +504,7 @@ Ldec_entry:
 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
 addi r9, r9, 16
 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
-vperm v5, v5, v6, $keyperm # align round key
+?vperm v5, v5, v6, $keyperm # align round key
 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
 bdnz Ldec_loop
@@ -564,25 +559,20 @@ Ldec_entry:
 bl _vpaes_decrypt_preheat
-neg r8, $inp # prepare for unaligned access
-lvsl $keyperm, 0, $key
-lvsr $outperm, 0, $out
-lvsr $inpperm, 0, r8 # -$inp
-vnor $outmask, v7, v7 # 0xff..ff
-lvx $inptail, 0, $inp
-vperm $outmask, v7, $outmask, $outperm
+?lvsl $inpperm, 0, $inp # prepare for unaligned access
+lvx v0, 0, $inp
 addi $inp, $inp, 15 # 15 is not a typo
-lvx $outhead, 0, $out
-########
-vmr v0, $inptail
+?lvsr $outperm, 0, $out
+?lvsl $keyperm, 0, $key
+vnor $outmask, v7, v7 # 0xff..ff
 lvx $inptail, 0, $inp # redundant in aligned case
-addi $inp, $inp, 16
-vperm v0, v0, $inptail, $inpperm
+?vperm $outmask, v7, $outmask, $outperm
+lvx $outhead, 0, $out
+?vperm v0, v0, $inptail, $inpperm
 bl _vpaes_decrypt_core
-vperm v0, v0, v0, $outperm # rotate left
+vperm v0, v0, v0, $outperm # rotate right/left
 vsel v1, $outhead, v0, $outmask
 vmr $outhead, v0
 stvx v1, 0, $out
@@ -673,18 +663,18 @@ Ldec_entry:
 lvx v24, 0, r31 # load [potentially unaligned] iv
 li r9, 15
-lvsl $inpperm, 0, r31
+?lvsl $inpperm, 0, r31
 lvx v25, r9, r31
-vperm v24, v24, v25, $inpperm
+?vperm v24, v24, v25, $inpperm
 neg r8, $inp # prepare for unaligned access
 vxor v7, v7, v7
-lvsl $keyperm, 0, $key
-lvsr $outperm, 0, $out
-lvsr $inpperm, 0, r8 # -$inp
+?lvsl $keyperm, 0, $key
+?lvsr $outperm, 0, $out
+?lvsr $inpperm, 0, r8 # -$inp
 vnor $outmask, v7, v7 # 0xff..ff
 lvx $inptail, 0, $inp
-vperm $outmask, v7, $outmask, $outperm
+?vperm $outmask, v7, $outmask, $outperm
 addi $inp, $inp, 15 # 15 is not a typo
 lvx $outhead, 0, $out
@@ -697,14 +687,14 @@ Lcbc_enc_loop:
 vmr v0, $inptail
 lvx $inptail, 0, $inp
 addi $inp, $inp, 16
-vperm v0, v0, $inptail, $inpperm
+?vperm v0, v0, $inptail, $inpperm
 vxor v0, v0, v24 # ^= iv
 bl _vpaes_encrypt_core
 vmr v24, v0 # put aside iv
 sub. r30, r30, r0 # len -= 16
-vperm v0, v0, v0, $outperm # rotate left
+vperm v0, v0, v0, $outperm # rotate right/left
 vsel v1, $outhead, v0, $outmask
 vmr $outhead, v0
 stvx v1, 0, $out
@@ -722,7 +712,7 @@ Lcbc_dec_loop:
 vmr v0, $inptail
 lvx $inptail, 0, $inp
 addi $inp, $inp, 16
-vperm v0, v0, $inptail, $inpperm
+?vperm v0, v0, $inptail, $inpperm
 vmr v25, v0 # put aside input
 bl _vpaes_decrypt_core
@@ -730,7 +720,7 @@ Lcbc_dec_loop:
 vxor v0, v0, v24 # ^= iv
 vmr v24, v25
 sub. r30, r30, r0 # len -= 16
-vperm v0, v0, v0, $outperm # rotate left
+vperm v0, v0, v0, $outperm # rotate right/left
 vsel v1, $outhead, v0, $outmask
 vmr $outhead, v0
 stvx v1, 0, $out
@@ -744,12 +734,12 @@ Lcbc_done:
 stvx v1, 0, $out
 neg r8, r31 # write [potentially unaligned] iv
-lvsl $outperm, 0, r8
+?lvsl $outperm, 0, r8
 li r6, 15
 vnor $outmask, v7, v7 # 0xff..ff
-vperm $outmask, v7, $outmask, $outperm
+?vperm $outmask, v7, $outmask, $outperm
 lvx $outhead, 0, r31
-vperm v24, v24, v24, $outperm # rotate
+vperm v24, v24, v24, $outperm # rotate right/left
 vsel v0, $outhead, v24, $outmask
 lvx v1, r6, r31
 stvx v0, 0, r31
@@ -863,10 +853,10 @@ _vpaes_schedule_core:
 neg r8, $inp # prepare for unaligned access
 lvx v0, 0, $inp
 addi $inp, $inp, 15 # 15 is not typo
-lvsr $inpperm, 0, r8 # -$inp
+?lvsr $inpperm, 0, r8 # -$inp
 lvx v6, 0, $inp # v6 serves as inptail
 addi $inp, $inp, 8
-vperm v0, v0, v6, $inpperm
+?vperm v0, v0, v6, $inpperm
 # input transform
 vmr v3, v0 # vmovdqa %xmm0, %xmm3
@@ -879,13 +869,13 @@ _vpaes_schedule_core:
 li r8, 0x30 # mov \$0x30,%r8d
 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
-lvsr $outperm, 0, $out # prepare for unaligned access
-vspltisb $outmask, -1 # 0xff..ff
+?lvsr $outperm, 0, $out # prepare for unaligned access
+vnor $outmask, v9, v9 # 0xff..ff
 lvx $outhead, 0, $out
-vperm $outmask, v9, $outmask, $outperm
+?vperm $outmask, v9, $outmask, $outperm
 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
-vperm v1, v0, v0, $outperm # rotate left
+vperm v1, v0, v0, $outperm # rotate right/left
 vsel v2, $outhead, v1, $outmask
 vmr $outhead, v1
 stvx v2, 0, $out
@@ -901,14 +891,14 @@ Lschedule_am_decrypting:
 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
 neg r0, $out # prepare for unaligned access
-lvsl $outperm, 0, r0
+?lvsl $outperm, 0, r0
 addi $out, $out, 15 # 15 is not typo
-vspltisb $outmask, -1 # 0xff..ff
+vnor $outmask, v9, v9 # 0xff..ff
 lvx $outhead, 0, $out
-vperm $outmask, $outmask, v9, $outperm
+?vperm $outmask, $outmask, v9, $outperm
 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
-vperm v4, v4, v4, $outperm # rotate left
+vperm v4, v4, v4, $outperm # rotate right/left
 vsel v2, $outhead, v4, $outmask
 vmr $outhead, v4
 stvx v2, 0, $out
@@ -957,16 +947,16 @@ Loop_schedule_128:
 Lschedule_192:
 li r0, 4 # mov \$4, %esi
 lvx v0, 0, $inp
-vperm v0, v6, v0, $inpperm
-vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
+?vperm v0, v6, v0, $inpperm
+?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
 bl _vpaes_schedule_transform # input transform
-vsldoi v6, v0, v9, 8
-vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
+?vsldoi v6, v0, v9, 8
+?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
 mtctr r0
 Loop_schedule_192:
 bl _vpaes_schedule_round
-vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
+?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
 bl _vpaes_schedule_mangle # save key n
 bl _vpaes_schedule_192_smear
 bl _vpaes_schedule_mangle # save key n+1
@@ -991,7 +981,7 @@ Lschedule_256:
 li r0, 7 # mov \$7, %esi
 addi $inp, $inp, 8
 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
-vperm v0, v6, v0, $inpperm
+?vperm v0, v6, v0, $inpperm
 bl _vpaes_schedule_transform # input transform
 mtctr r0
@@ -1005,7 +995,7 @@ Loop_schedule_256:
 bl _vpaes_schedule_mangle
 # low round. swap xmm7 and xmm6
-vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
+?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
 vmr v5, v7 # vmovdqa %xmm7, %xmm5
 vmr v7, v6 # vmovdqa %xmm6, %xmm7
 bl _vpaes_schedule_low_round
@@ -1042,7 +1032,7 @@ Lschedule_mangle_last:
 bl _vpaes_schedule_transform # output transform
 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
-vperm v0, v0, v0, $outperm # rotate left
+vperm v0, v0, v0, $outperm # rotate right/left
 vsel v2, $outhead, v0, $outmask
 vmr $outhead, v0
 stvx v2, 0, $out
@@ -1062,7 +1052,7 @@ Lschedule_mangle_last_dec:
 bl _vpaes_schedule_transform # output transform
 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
-vperm v0, v0, v0, $outperm # rotate left
+vperm v0, v0, v0, $outperm # rotate right/left
 vsel v2, $outhead, v0, $outmask
 vmr $outhead, v0
 stvx v2, 0, $out
@@ -1104,14 +1094,14 @@ Lschedule_mangle_done:
 ##
 .align 4
 _vpaes_schedule_192_smear:
-vspltw v0, v7, 3
-vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
-vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
+?vspltw v0, v7, 3
+?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
+?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
 vmr v0, v6
-vsldoi v6, v6, v9, 8
-vsldoi v6, v9, v6, 8 # clobber low side with zeros
+?vsldoi v6, v6, v9, 8
+?vsldoi v6, v9, v6, 8 # clobber low side with zeros
 blr
 .long 0
 .byte 0,12,0x14,0,0,0,0,0
@@ -1138,23 +1128,23 @@ _vpaes_schedule_192_smear:
 _vpaes_schedule_round:
 # extract rcon from xmm8
 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
-vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
-vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
+?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
+?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
 # rotate
-vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
-vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
+?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
+?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
 # fall through...
 # low round: same as high round, but no rotation and no rcon.
 _vpaes_schedule_low_round:
 # smear xmm7
-vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
+?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
 vspltisb v1, 0x0f # 0x0f..0f
-vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
+?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
 # subbytes
 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
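(Reader's note, not part of the commit: v9 is held at zero here, so the big-endian vsldoi pairs emulate the x86 byte shifts named in the comments — e.g. vsldoi v1, v9, v7, 12 shifts four zero bytes in like vpslldq \$4. On a little-endian flavour the post-processor rewrites a tagged ?vsldoi vD, vA, vB, n as vsldoi vD, vB, vA, 16-n, swapping the sources and complementing the shift so the byte movement is the same once the lane order is mirrored. For instance, ?vsldoi v4, v9, v7, 8 above is emitted as

    vsldoi v4, v7, v9, 16-8

and the assembler folds 16-8 to 8.)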
@@ -1248,7 +1238,7 @@ _vpaes_schedule_mangle:
 andi. r8, r8, 0x30 # and \$0x30, %r8
 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
-vperm v1, v3, v3, $outperm # rotate left
+vperm v1, v3, v3, $outperm # rotate right/left
 vsel v2, $outhead, v1, $outmask
 vmr $outhead, v1
 stvx v2, 0, $out
@@ -1299,7 +1289,7 @@ Lschedule_mangle_dec:
 andi. r8, r8, 0x30 # and \$0x30, %r8
 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
-vperm v1, v3, v3, $outperm # rotate left
+vperm v1, v3, v3, $outperm # rotate right/left
 vsel v2, $outhead, v1, $outmask
 vmr $outhead, v1
 stvx v2, 0, $out
@@ -1346,7 +1336,7 @@ Lschedule_mangle_dec:
 addi r9, r9, 6 # add \$5,%eax
 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
-cmplw $dir, $bits, $bits
+cmplw $dir, $bits, $bits # set encrypt direction
 li r8, 0x30 # mov \$0x30,%r8d
 bl _vpaes_schedule_core
@@ -1427,7 +1417,7 @@ Lschedule_mangle_dec:
 slwi r9, r9, 4 # shl \$4,%eax
 add $out, $out, r9 # lea (%rdx,%rax),%rdx
-cmplwi $dir, $bits, 0
+cmplwi $dir, $bits, 0 # set decrypt direction
 srwi r8, $bits, 1 # shr \$1,%r8d
 andi. r8, r8, 32 # and \$32,%r8d
 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
@@ -1470,8 +1460,48 @@ Lschedule_mangle_dec:
 ___
 }
-$code =~ s/\`([^\`]*)\`/eval($1)/gem;
-
-print $code;
+my $consts=1;
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval $1/geo;
+
+	# constants table endian-specific conversion
+	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
+	    my $conv=$2;
+	    my @bytes=();
+
+	    # convert to endian-agnostic format
+	    foreach (split(/,\s+/,$1)) {
+		my $l = /^0/?oct:int;
+		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+	    }
+
+	    # little-endian conversion
+	    if ($flavour =~ /le$/o) {
+		SWITCH: for($conv) {
+		    /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
+		    /\?rev/ && do { @bytes=reverse(@bytes); last; };
+		}
+	    }
+
+	    #emit
+	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+	    next;
+	}
+	$consts=0 if (m/Lconsts:/o);	# end of table
+
+	# instructions prefixed with '?' are endian-specific and need
+	# to be adjusted accordingly...
+	if ($flavour =~ /le$/o) {	# little-endian
+	    s/\?lvsr/lvsl/o or
+	    s/\?lvsl/lvsr/o or
+	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+	} else {			# big-endian
+	    s/\?([a-z]+)/$1/o;
+	}
+
+	print $_,"\n";
+}
+
 close STDOUT;
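To see the instruction-rewriting rules in action in isolation, here is a minimal standalone Perl sketch (mine, not part of the commit) that feeds a few sample lines through the same substitutions:

    #!/usr/bin/env perl
    # Minimal sketch, not part of the commit: exercise the '?' rewrite
    # rules on sample lines for a big- or little-endian flavour.
    my $flavour = $ARGV[0] || "linux64le";

    my @lines = (
        "?lvsl v1, 0, r3",
        "?vperm v5, v5, v6, v7",
        "?vsldoi v0, v6, v0, 8",
        "?vspltw v0, v7, 3",
    );

    foreach my $line (@lines) {
        my $out = $line;
        for ($out) {
            if ($flavour =~ /le$/) {    # little-endian: swap/adjust
                s/\?lvsr/lvsl/ or
                s/\?lvsl/lvsr/ or
                s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
                s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/ or
                s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/;
            } else {                    # big-endian: just drop the '?'
                s/\?([a-z]+)/$1/;
            }
        }
        print "$line -> $out\n";
        # le output: lvsr v1, 0, r3
        #            vperm v5, v6, v5, v7
        #            vsldoi v0, v0, v6, 16-8
        #            vspltw v0, v7, 3-3
    }

With the default little-endian flavour this swaps lvsl for lvsr, exchanges vperm's two source registers, rewrites ?vsldoi with swapped sources and complemented shift, and mirrors the vspltw word lane; the emitted 16-8 and 3-3 are constant expressions the assembler folds.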