From addb6e16a873ee9af5bd61404459b75ed1aa7226 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 18 Jan 2005 01:04:41 +0000 Subject: [PATCH] Throw in AES CBC assembler, up to +40% on aes-128-cbc benchmark. --- Configure | 6 +- TABLE | 62 +++--- crypto/aes/asm/aes-586.pl | 424 ++++++++++++++++++++++++++++++-------- crypto/perlasm/x86ms.pl | 1 + crypto/perlasm/x86nasm.pl | 1 + crypto/perlasm/x86unix.pl | 1 + 6 files changed, 371 insertions(+), 124 deletions(-) diff --git a/Configure b/Configure index 85cebe507..d2d0e9ed3 100755 --- a/Configure +++ b/Configure @@ -114,9 +114,9 @@ my $tlib="-lnsl -lsocket"; my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_elf_asm="x86cpuid-elf.o:asm/bn86-elf.o asm/co86-elf.o:asm/dx86-elf.o asm/yx86-elf.o:aes_cbc.o asm/ax86-elf.o:asm/bx86-elf.o:asm/mx86-elf.o:asm/sx86-elf.o asm/s512sse2-elf.o:asm/cx86-elf.o:asm/rx86-elf.o:asm/rm86-elf.o:asm/r586-elf.o"; -my $x86_coff_asm="x86cpuid-cof.o:asm/bn86-cof.o asm/co86-cof.o:asm/dx86-cof.o asm/yx86-cof.o:aes_cbc.o asm/ax86-cof.o:asm/bx86-cof.o:asm/mx86-cof.o:asm/sx86-cof.o asm/s512sse2-cof.o:asm/cx86-cof.o:asm/rx86-cof.o:asm/rm86-cof.o:asm/r586-cof.o"; -my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:aes_cbc.o ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o"; +my $x86_elf_asm="x86cpuid-elf.o:asm/bn86-elf.o asm/co86-elf.o:asm/dx86-elf.o asm/yx86-elf.o:asm/ax86-elf.o:asm/bx86-elf.o:asm/mx86-elf.o:asm/sx86-elf.o asm/s512sse2-elf.o:asm/cx86-elf.o:asm/rx86-elf.o:asm/rm86-elf.o:asm/r586-elf.o"; +my $x86_coff_asm="x86cpuid-cof.o:asm/bn86-cof.o asm/co86-cof.o:asm/dx86-cof.o asm/yx86-cof.o:asm/ax86-cof.o:asm/bx86-cof.o:asm/mx86-cof.o:asm/sx86-cof.o asm/s512sse2-cof.o:asm/cx86-cof.o:asm/rx86-cof.o:asm/rm86-cof.o:asm/r586-cof.o"; +my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o 
s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o"; my $ia64_asm=":asm/ia64.o::aes_core.o aes_cbc.o asm/aes-ia64.o:::asm/sha1-ia64.o asm/sha256-ia64.o asm/sha512-ia64.o::asm/rc4-ia64.o::"; diff --git a/TABLE b/TABLE index 7d7fbc37d..84c9c1eb6 100644 --- a/TABLE +++ b/TABLE @@ -92,7 +92,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-cof.o $bn_obj = asm/bn86-cof.o asm/co86-cof.o $des_obj = asm/dx86-cof.o asm/yx86-cof.o -$aes_obj = aes_cbc.o asm/ax86-cof.o +$aes_obj = asm/ax86-cof.o $bf_obj = asm/bx86-cof.o $md5_obj = asm/mx86-cof.o $sha1_obj = asm/sx86-cof.o asm/s512sse2-cof.o @@ -146,7 +146,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-out.o $bn_obj = bn86-out.o co86-out.o $des_obj = dx86-out.o yx86-out.o -$aes_obj = ase_cbc.o ax86-out.o +$aes_obj = ax86-out.o $bf_obj = bx86-out.o $md5_obj = mx86-out.o $sha1_obj = sx86-out.o s512sse2-out.o @@ -173,7 +173,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-out.o $bn_obj = bn86-out.o co86-out.o $des_obj = dx86-out.o yx86-out.o -$aes_obj = ase_cbc.o ax86-out.o +$aes_obj = ax86-out.o $bf_obj = bx86-out.o $md5_obj = mx86-out.o $sha1_obj = sx86-out.o s512sse2-out.o @@ -227,7 +227,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -551,7 +551,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-out.o $bn_obj = bn86-out.o co86-out.o $des_obj = dx86-out.o yx86-out.o -$aes_obj = ase_cbc.o ax86-out.o +$aes_obj = ax86-out.o $bf_obj = bx86-out.o $md5_obj = mx86-out.o $sha1_obj = sx86-out.o s512sse2-out.o @@ -767,7 +767,7 @@ $bn_ops = BN_LLONG MD2_CHAR RC4_INDEX 
DES_PTR DES_RISC1 DES_UNROLL $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -794,7 +794,7 @@ $bn_ops = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1334,7 +1334,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1685,7 +1685,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1739,7 +1739,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1766,7 +1766,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj 
= asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1847,7 +1847,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1874,7 +1874,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1901,7 +1901,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1928,7 +1928,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -1955,7 +1955,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -2090,7 +2090,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o 
$bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -2279,7 +2279,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -2738,7 +2738,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -3035,7 +3035,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-out.o $bn_obj = bn86-out.o co86-out.o $des_obj = dx86-out.o yx86-out.o -$aes_obj = ase_cbc.o ax86-out.o +$aes_obj = ax86-out.o $bf_obj = bx86-out.o $md5_obj = mx86-out.o $sha1_obj = sx86-out.o s512sse2-out.o @@ -3062,7 +3062,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -3116,7 +3116,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ 
-3197,7 +3197,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -3332,7 +3332,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -3413,7 +3413,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -3629,7 +3629,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-cof.o $bn_obj = asm/bn86-cof.o asm/co86-cof.o $des_obj = asm/dx86-cof.o asm/yx86-cof.o -$aes_obj = aes_cbc.o asm/ax86-cof.o +$aes_obj = asm/ax86-cof.o $bf_obj = asm/bx86-cof.o $md5_obj = asm/mx86-cof.o $sha1_obj = asm/sx86-cof.o asm/s512sse2-cof.o @@ -4034,7 +4034,7 @@ $bn_ops = DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -4061,7 +4061,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o 
asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -4277,7 +4277,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -4547,7 +4547,7 @@ $bn_ops = BN_LLONG MD2_CHAR RC4_INDEX DES_PTR DES_RISC1 DES_UNROLL $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o @@ -4574,7 +4574,7 @@ $bn_ops = BN_LLONG DES_PTR DES_RISC1 DES_UNROLL RC4_INDEX MD2_INT $cpuid_obj = x86cpuid-elf.o $bn_obj = asm/bn86-elf.o asm/co86-elf.o $des_obj = asm/dx86-elf.o asm/yx86-elf.o -$aes_obj = aes_cbc.o asm/ax86-elf.o +$aes_obj = asm/ax86-elf.o $bf_obj = asm/bx86-elf.o $md5_obj = asm/mx86-elf.o $sha1_obj = asm/sx86-elf.o asm/s512sse2-elf.o diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl index 688fda21f..4263a7e1d 100755 --- a/crypto/aes/asm/aes-586.pl +++ b/crypto/aes/asm/aes-586.pl @@ -6,7 +6,7 @@ # forms are granted according to the OpenSSL license. # ==================================================================== # -# Version 2.0. +# Version 3.0. # # You might fail to appreciate this module performance from the first # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered @@ -58,31 +58,32 @@ # (*) Performance difference between small footprint code and fully # unrolled in more commonly used CBC mode is not as big, 7% for # PIII and 15% for Pentium, which I consider tolerable. 
+# +# Third version adds AES_cbc_encrypt implementation, which resulted in +# up to 40% performance improvement of CBC benchmark results [on most +# recent µ-archs]. CBC performance is virtually as good as ECB now and +# sometimes even better, because function prologues and epilogues are +# effectively taken out of the loop... push(@INC,"perlasm","../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"aes-586.pl",$ARGV[$#ARGV] eq "386"); +$s0="eax"; +$s1="ebx"; +$s2="ecx"; +$s3="edx"; +$key="edi"; +$acc="esi"; + $small_footprint=1; # $small_footprint=1 code is ~5% slower [on # recent µ-archs], but ~5 times smaller! # I favor compact code, because it minimizes # cache contention... $vertical_spin=0; # shift "verticaly" defaults to 0, because of - # its proof-of-concept status, see below... + # its proof-of-concept status... -$s0="eax"; -$s1="ebx"; -$s2="ecx"; -$s3="edx"; -$key="esi"; -$acc="edi"; - -if ($vertical_spin) { - # I need high parts of volatile registers to be accessible... - $s1="esi"; $key="ebx"; - $s2="edi"; $acc="ecx"; -} # Note that there is no decvert(), as well as last encryption round is # performed with "horizontal" shifts. 
This is because this "vertical" # implementation [one which groups shifts on a given $s[i] to form a @@ -193,12 +194,12 @@ sub enclast() &and ($out,0xFF); if ($i==1) { &shr ($s[0],16); }#%ebx[1] if ($i==2) { &shr ($s[0],24); }#%ecx[2] - &mov ($out,&DWP(1024*0,$te,$out,4)); + &mov ($out,&DWP(1024*4,$te,$out,4)); &and ($out,0x000000ff); if ($i==3) { $tmp=$s[1]; }##%eax &movz ($tmp,&HB($s[1])); - &mov ($tmp,&DWP(0,$te,$tmp,4)); + &mov ($tmp,&DWP(1024*4,$te,$tmp,4)); &and ($tmp,0x0000ff00); &xor ($out,$tmp); @@ -207,7 +208,7 @@ sub enclast() &shr ($tmp,16); } if ($i==2) { &and ($s[1],0xFF); }#%edx[2] &and ($tmp,0xFF); - &mov ($tmp,&DWP(0,$te,$tmp,4)); + &mov ($tmp,&DWP(1024*4,$te,$tmp,4)); &and ($tmp,0x00ff0000); &xor ($out,$tmp); @@ -215,38 +216,30 @@ sub enclast() elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2] else { &mov ($tmp,$s[3]); &shr ($tmp,24); } - &mov ($tmp,&DWP(0,$te,$tmp,4)); + &mov ($tmp,&DWP(1024*4,$te,$tmp,4)); &and ($tmp,0xff000000); &xor ($out,$tmp); if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); } if ($i==3) { &mov ($s[3],$acc); } } -# void AES_encrypt (const void *inp,void *out,const AES_KEY *key); &public_label("AES_Te"); -&function_begin("AES_encrypt"); - &mov ($acc,&wparam(0)); # load inp - &mov ($key,&wparam(2)); # load key - - &call (&label("pic_point")); # make it PIC! - &set_label("pic_point"); - &blindpop("ebp"); - &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); +&function_begin_B("_x86_AES_encrypt"); + if ($vertical_spin) { + # I need high parts of volatile registers to be accessible... 
+ &exch ($s1="edi",$key="ebx"); + &mov ($s2="esi",$acc="ecx"); + } # allocate aligned stack frame - &mov ($s0,"esp"); + &mov ($acc,"esp"); &sub ("esp",20); &and ("esp",-16); &mov (&DWP(12,"esp"),$key); # save key - &mov (&DWP(16,"esp"),$s0); # save %esp + &mov (&DWP(16,"esp"),$acc); # save %esp - &mov ($s0,&DWP(0,$acc)); # load input data - &mov ($s1,&DWP(4,$acc)); - &mov ($s2,&DWP(8,$acc)); - &mov ($s3,&DWP(12,$acc)); - - &xor ($s0,&DWP(0,$key)); + &xor ($s0,&DWP(0,$key)); # xor with key &xor ($s1,&DWP(4,$key)); &xor ($s2,&DWP(8,$key)); &xor ($s3,&DWP(12,$key)); @@ -333,11 +326,10 @@ sub enclast() } } - &add ("ebp",4*1024); # skip to Te4 if ($vertical_spin) { # "reincarnate" some registers for "horizontal" spin... - &mov ($s1="ebx",$key="esi"); - &mov ($s2="ecx",$acc="edi"); + &mov ($s1="ebx",$key="edi"); + &mov ($s2="ecx",$acc="esi"); } &enclast(0,"ebp",$s0,$s1,$s2,$s3); &enclast(1,"ebp",$s1,$s2,$s3,$s0); @@ -351,16 +343,6 @@ sub enclast() &xor ($s2,&DWP(8,$key)); &xor ($s3,&DWP(12,$key)); - &mov ($acc,&wparam(1)); # load out - &mov (&DWP(0,$acc),$s0); # write output data - &mov (&DWP(4,$acc),$s1); - &mov (&DWP(8,$acc),$s2); - &mov (&DWP(12,$acc),$s3); - - &pop ("edi"); - &pop ("esi"); - &pop ("ebx"); - &pop ("ebp"); &ret (); &set_label("AES_Te",64); # Yes! I keep it in the code segment! @@ -692,17 +674,35 @@ sub enclast() &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008); &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080); &data_word(0x0000001b, 0x00000036); -&function_end_B("AES_encrypt"); +&function_end_B("_x86_AES_encrypt"); + +# void AES_encrypt (const void *inp,void *out,const AES_KEY *key); +&public_label("AES_Te"); +&function_begin("AES_encrypt"); + &mov ($acc,&wparam(0)); # load inp + &mov ($key,&wparam(2)); # load key + + &call (&label("pic_point")); # make it PIC! 
+ &set_label("pic_point"); + &blindpop("ebp"); + &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); + + &mov ($s0,&DWP(0,$acc)); # load input data + &mov ($s1,&DWP(4,$acc)); + &mov ($s2,&DWP(8,$acc)); + &mov ($s3,&DWP(12,$acc)); + + &call ("_x86_AES_encrypt"); + + &mov ($acc,&wparam(1)); # load out + &mov (&DWP(0,$acc),$s0); # write output data + &mov (&DWP(4,$acc),$s1); + &mov (&DWP(8,$acc),$s2); + &mov (&DWP(12,$acc),$s3); +&function_end("AES_encrypt"); #------------------------------------------------------------------# -$s0="eax"; -$s1="ebx"; -$s2="ecx"; -$s3="edx"; -$key="edi"; -$acc="esi"; - sub decstep() { my ($i,$td,@s) = @_; my $tmp = $key; @@ -744,12 +744,12 @@ sub declast() if($i==3) { &mov ($key,&DWP(12,"esp")); } else { &mov ($out,$s[0]); } &and ($out,0xFF); - &mov ($out,&DWP(0,$td,$out,4)); + &mov ($out,&DWP(1024*4,$td,$out,4)); &and ($out,0x000000ff); if ($i==3) { $tmp=$s[1]; } &movz ($tmp,&HB($s[1])); - &mov ($tmp,&DWP(0,$td,$tmp,4)); + &mov ($tmp,&DWP(1024*4,$td,$tmp,4)); &and ($tmp,0x0000ff00); &xor ($out,$tmp); @@ -757,45 +757,31 @@ sub declast() else { mov ($tmp,$s[2]); } &shr ($tmp,16); &and ($tmp,0xFF); - &mov ($tmp,&DWP(0,$td,$tmp,4)); + &mov ($tmp,&DWP(1024*4,$td,$tmp,4)); &and ($tmp,0x00ff0000); &xor ($out,$tmp); if ($i==3) { $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); } else { &mov ($tmp,$s[3]); } &shr ($tmp,24); - &mov ($tmp,&DWP(0,$td,$tmp,4)); + &mov ($tmp,&DWP(1024*4,$td,$tmp,4)); &and ($tmp,0xff000000); &xor ($out,$tmp); if ($i<2) { &mov (&DWP(4*$i,"esp"),$out); } if ($i==3) { &mov ($s[3],&DWP(0,"esp")); } } -# void AES_decrypt (const void *inp,void *out,const AES_KEY *key); &public_label("AES_Td"); -&function_begin("AES_decrypt"); - &mov ($acc,&wparam(0)); # load inp - &mov ($key,&wparam(2)); # load key - - &call (&label("pic_point")); # make it PIC! 
- &set_label("pic_point"); - &blindpop("ebp"); - &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); - +&function_begin_B("_x86_AES_decrypt"); # allocate aligned stack frame - &mov ($s0,"esp"); + &mov ($acc,"esp"); &sub ("esp",20); &and ("esp",-16); &mov (&DWP(12,"esp"),$key); # save key - &mov (&DWP(16,"esp"),$s0); # save %esp + &mov (&DWP(16,"esp"),$acc); # save %esp - &mov ($s0,&DWP(0,$acc)); # load input data - &mov ($s1,&DWP(4,$acc)); - &mov ($s2,&DWP(8,$acc)); - &mov ($s3,&DWP(12,$acc)); - - &xor ($s0,&DWP(0,$key)); + &xor ($s0,&DWP(0,$key)); # xor with key &xor ($s1,&DWP(4,$key)); &xor ($s2,&DWP(8,$key)); &xor ($s3,&DWP(12,$key)); @@ -866,7 +852,6 @@ sub declast() } } - &add ("ebp",4*1024); # skip to Te4 &declast(0,"ebp",$s0,$s3,$s2,$s1); &declast(1,"ebp",$s1,$s0,$s3,$s2); &declast(2,"ebp",$s2,$s1,$s0,$s3); @@ -879,16 +864,6 @@ sub declast() &xor ($s2,&DWP(8,$key)); &xor ($s3,&DWP(12,$key)); - &mov ($key,&wparam(1)); # load out - &mov (&DWP(0,$key),$s0); # write output data - &mov (&DWP(4,$key),$s1); - &mov (&DWP(8,$key),$s2); - &mov (&DWP(12,$key),$s3); - - &pop ("edi"); - &pop ("esi"); - &pop ("ebx"); - &pop ("ebp"); &ret (); &set_label("AES_Td",64); # Yes! I keep it in the code segment! @@ -1216,7 +1191,276 @@ sub declast() &data_word(0xbabababa, 0x77777777, 0xd6d6d6d6, 0x26262626); &data_word(0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363); &data_word(0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d); -&function_end_B("AES_decrypt"); +&function_end_B("_x86_AES_decrypt"); + +# void AES_decrypt (const void *inp,void *out,const AES_KEY *key); +&public_label("AES_Td"); +&function_begin("AES_decrypt"); + &mov ($acc,&wparam(0)); # load inp + &mov ($key,&wparam(2)); # load key + + &call (&label("pic_point")); # make it PIC! 
+ &set_label("pic_point"); + &blindpop("ebp"); + &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); + + &mov ($s0,&DWP(0,$acc)); # load input data + &mov ($s1,&DWP(4,$acc)); + &mov ($s2,&DWP(8,$acc)); + &mov ($s3,&DWP(12,$acc)); + + &call ("_x86_AES_decrypt"); + + &mov ($acc,&wparam(1)); # load out + &mov (&DWP(0,$acc),$s0); # write output data + &mov (&DWP(4,$acc),$s1); + &mov (&DWP(8,$acc),$s2); + &mov (&DWP(12,$acc),$s3); +&function_end("AES_decrypt"); + +# void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out, +# size_t length, const AES_KEY *key, +# unsigned char *ivp,const int enc); +&public_label("AES_Te"); +&public_label("AES_Td"); +&function_begin("AES_cbc_encrypt"); + &mov ($s2 eq "ecx"? $s2 : "",&wparam(2)); # load len + &cmp ($s2,0); + &je (&label("enc_out")); + + &call (&label("pic_point")); # make it PIC! + &set_label("pic_point"); + &blindpop("ebp"); + + &cmp (&wparam(5),0); + &je (&label("DECRYPT")); + + &lea ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp")); + + &mov ($acc,&wparam(0)); # load inp + &mov ($key,&wparam(4)); # load ivp + + &test ($s2,~15); + &jz (&label("enc_tail")); # short input... 
+ + &mov ($s0,&DWP(0,$key)); # load iv + &mov ($s1,&DWP(4,$key)); + + &align (4); + &set_label("enc_loop"); + &mov ($s2,&DWP(8,$key)); + &mov ($s3,&DWP(12,$key)); + + &xor ($s0,&DWP(0,$acc)); # xor input data + &xor ($s1,&DWP(4,$acc)); + &xor ($s2,&DWP(8,$acc)); + &xor ($s3,&DWP(12,$acc)); + + &mov ($key,&wparam(3)); # load key + &call ("_x86_AES_encrypt"); + + &mov ($acc,&wparam(0)); # load inp + &mov ($key,&wparam(1)); # load out + + &mov (&DWP(0,$key),$s0); # save output data + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + + &mov ($s2,&wparam(2)); # load len + + &lea ($acc,&DWP(16,$acc)); + &mov (&wparam(0),$acc); # save inp + + &lea ($s3,&DWP(16,$key)); + &mov (&wparam(1),$s3); # save out + + &sub ($s2,16); + &test ($s2,~15); + &mov (&wparam(2),$s2); # save len + &jnz (&label("enc_loop")); + &test ($s2,15); + &jnz (&label("enc_tail")); + &mov ($acc,&wparam(4)); # load ivp + &mov ($s2,&DWP(8,$key)); # restore last dwords + &mov ($s3,&DWP(12,$key)); + &mov (&DWP(0,$acc),$s0); # save iv + &mov (&DWP(4,$acc),$s1); + &mov (&DWP(8,$acc),$s2); + &mov (&DWP(12,$acc),$s3); + &set_label("enc_out"); + &function_end_A(); + + &align (4); + &set_label("enc_tail"); + &push ($key eq "edi" ? $key : ""); # push ivp + &pushf (); + &mov ($key,&wparam(1)); # load out + &xor ($s0,$s0); + &mov (&DWP(0,$key),$s0); # zero output + &mov (&DWP(4,$key),$s0); + &mov (&DWP(8,$key),$s0); + &mov (&DWP(12,$key),$s0); + &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input + &popf (); + &pop ($key); # pop ivp + + &mov ($acc,&wparam(1)); # output as input + &mov ($s0,&DWP(0,$key)); + &mov ($s1,&DWP(4,$key)); + &mov (&wparam(2),16); # len=16 + &jmp (&label("enc_loop")); # one more spin... 
+ +#----------------------------- DECRYPT -----------------------------# +&align (4); +&set_label("DECRYPT"); + &stack_push(5); # allocate temp + ivp + + &lea ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp")); + + &mov ($acc,&wparam(0)); # load inp + &cmp ($acc,&wparam(1)); + &je (&label("dec_in_place")); # in-place processing... + + &mov ($key,&wparam(4)); # load ivp + &mov (&swtmp(4),$key); + + &align (4); + &set_label("dec_loop"); + &mov ($s0,&DWP(0,$acc)); # read input + &mov ($s1,&DWP(4,$acc)); + &mov ($s2,&DWP(8,$acc)); + &mov ($s3,&DWP(12,$acc)); + + &mov ($key,&wparam(3)); # load key + &call ("_x86_AES_decrypt"); + + &mov ($key,&swtmp(4)); # load ivp + &mov ($acc,&wparam(2)); # load len + &xor ($s0,&DWP(0,$key)); # xor iv + &xor ($s1,&DWP(4,$key)); + &xor ($s2,&DWP(8,$key)); + &xor ($s3,&DWP(12,$key)); + + &sub ($acc,16); + &jc (&label("dec_partial")); + &mov (&wparam(2),$acc); # save len + &mov ($acc,&wparam(0)); # load inp + &mov ($key,&wparam(1)); # load out + + &mov (&DWP(0,$key),$s0); # write output + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + + &mov (&swtmp(4),$acc); # save ivp + &lea ($acc,&DWP(16,$acc)); + &mov (&wparam(0),$acc); # save inp + + &lea ($key,&DWP(16,$key)); + &mov (&wparam(1),$key); # save out + + &jnz (&label("dec_loop")); + &mov ($key,&swtmp(4)); # load temp ivp + &set_label("dec_end"); + &mov ($acc,&wparam(4)); # load user ivp + &mov ($s0,&DWP(0,$key)); # load iv + &mov ($s1,&DWP(4,$key)); + &mov ($s2,&DWP(8,$key)); + &mov ($s3,&DWP(12,$key)); + &mov (&DWP(0,$acc),$s0); # copy back to user + &mov (&DWP(4,$acc),$s1); + &mov (&DWP(8,$acc),$s2); + &mov (&DWP(12,$acc),$s3); + &jmp (&label("dec_out")); + + &align (4); + &set_label("dec_partial"); + &lea ($key,&swtmp(0)); + &mov (&DWP(0,$key),$s0); # dump output to stack + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + &lea ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc)); + &mov ($acc eq "esi" ? 
$acc : "",$key); + &mov ($key eq "edi" ? $key : "",&wparam(1)); + &pushf (); + &data_word(0x90A4F3FC); # cld; rep movsb; nop # copy output + &popf (); + &mov ($key,&wparam(0)); # load temp ivp + &jmp (&label("dec_end")); + + &align (4); + &set_label("dec_in_place"); + &set_label("dec_in_place_loop"); + &lea ($key,&swtmp(0)); + &mov ($s0,&DWP(0,$acc)); # read input + &mov ($s1,&DWP(4,$acc)); + &mov ($s2,&DWP(8,$acc)); + &mov ($s3,&DWP(12,$acc)); + + &mov (&DWP(0,$key),$s0); # copy to temp + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + + &mov ($key,&wparam(3)); # load key + &call ("_x86_AES_decrypt"); + + &mov ($key,&wparam(4)); # load ivp + &mov ($acc,&wparam(1)); # load out + &xor ($s0,&DWP(0,$key)); # xor iv + &xor ($s1,&DWP(4,$key)); + &xor ($s2,&DWP(8,$key)); + &xor ($s3,&DWP(12,$key)); + + &mov (&DWP(0,$acc),$s0); # write output + &mov (&DWP(4,$acc),$s1); + &mov (&DWP(8,$acc),$s2); + &mov (&DWP(12,$acc),$s3); + + &lea ($acc,&DWP(16,$acc)); + &mov (&wparam(1),$acc); # save out + + &lea ($acc,&swtmp(0)); + &mov ($s0,&DWP(0,$acc)); # read temp + &mov ($s1,&DWP(4,$acc)); + &mov ($s2,&DWP(8,$acc)); + &mov ($s3,&DWP(12,$acc)); + + &mov (&DWP(0,$key),$s0); # copy iv + &mov (&DWP(4,$key),$s1); + &mov (&DWP(8,$key),$s2); + &mov (&DWP(12,$key),$s3); + + &mov ($acc,&wparam(0)); # load inp + + &lea ($acc,&DWP(16,$acc)); + &mov (&wparam(0),$acc); # save inp + + &mov ($s2,&wparam(2)); # load len + &sub ($s2,16); + &jc (&label("dec_in_place_partial")); + &mov (&wparam(2),$s2); # save len + &jnz (&label("dec_in_place_loop")); + &jmp (&label("dec_out")); + + &align (4); + &set_label("dec_in_place_partial"); + # one can argue if this is actually required... + &mov ($key eq "edi" ? $key : "",&wparam(1)); + &lea ($acc eq "esi" ? $acc : "",&swtmp(0)); + &lea ($key,&DWP(0,$key,$s2)); + &lea ($acc,&DWP(16,$acc,$s2)); + &neg ($s2 eq "ecx" ? 
$s2 : ""); + &pushf (); + &data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail + &popf (); + &set_label("dec_out"); + &stack_pop(5); +&function_end("AES_cbc_encrypt"); + +#------------------------------------------------------------------# sub enckey() { diff --git a/crypto/perlasm/x86ms.pl b/crypto/perlasm/x86ms.pl index 41fe9715d..35ba300d0 100644 --- a/crypto/perlasm/x86ms.pl +++ b/crypto/perlasm/x86ms.pl @@ -175,6 +175,7 @@ sub main'cpuid { &out0("DW\t0A20Fh"); } sub main'rdtsc { &out0("DW\t0310Fh"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzx",@_); } +sub main'neg { &out1("neg",@_); } # SSE2 sub main'emms { &out0("emms"); } diff --git a/crypto/perlasm/x86nasm.pl b/crypto/perlasm/x86nasm.pl index 9792f13e1..b897ed0fd 100644 --- a/crypto/perlasm/x86nasm.pl +++ b/crypto/perlasm/x86nasm.pl @@ -193,6 +193,7 @@ sub main'cpuid { &out0("cpuid"); } sub main'rdtsc { &out0("rdtsc"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzx",@_); } +sub main'neg { &out1("neg",@_); } # SSE2 sub main'emms { &out0("emms"); } diff --git a/crypto/perlasm/x86unix.pl b/crypto/perlasm/x86unix.pl index 59c4fbc08..4381580be 100644 --- a/crypto/perlasm/x86unix.pl +++ b/crypto/perlasm/x86unix.pl @@ -209,6 +209,7 @@ sub main'cpuid { &out0(".byte\t0x0f,0xa2"); } sub main'rdtsc { &out0(".byte\t0x0f,0x31"); } sub main'halt { &out0("hlt"); } sub main'movz { &out2("movzbl",@_); } +sub main'neg { &out1("negl",@_); } # SSE2 sub main'emms { &out0("emms"); }