From aa8f38e49b2430a1939d7e9a8d2ecaa77edbb1a9 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 12 Nov 2008 08:15:52 +0000 Subject: [PATCH] x86_64 assembler pack to comply with updated styling x86_64-xlate.pl rules. --- crypto/aes/asm/aes-x86_64.pl | 23 ++-- crypto/bn/asm/x86_64-gcc.c | 4 +- crypto/rc4/asm/rc4-x86_64.pl | 3 +- crypto/sha/asm/sha512-x86_64.pl | 9 +- crypto/whrlpool/asm/wp-x86_64.pl | 3 +- crypto/x86_64cpuid.pl | 205 ++++++++++++++----------------- 6 files changed, 111 insertions(+), 136 deletions(-) diff --git a/crypto/aes/asm/aes-x86_64.pl b/crypto/aes/asm/aes-x86_64.pl index da4253889..d04150734 100755 --- a/crypto/aes/asm/aes-x86_64.pl +++ b/crypto/aes/asm/aes-x86_64.pl @@ -617,8 +617,7 @@ AES_encrypt: push $key # pick Te4 copy which can't "overlap" with stack frame or key schedule - .picmeup $sbox - lea AES_Te+2048-.($sbox),$sbox + lea .LAES_Te+2048(%rip),$sbox lea 768(%rsp),%rbp sub $sbox,%rbp and \$0x300,%rbp @@ -1210,8 +1209,7 @@ AES_decrypt: push $key # pick Td4 copy which can't "overlap" with stack frame or key schedule - .picmeup $sbox - lea AES_Td+2048-.($sbox),$sbox + lea .LAES_Td+2048(%rip),$sbox lea 768(%rsp),%rbp sub $sbox,%rbp and \$0x300,%rbp @@ -1292,8 +1290,7 @@ _x86_64_AES_set_encrypt_key: test \$-1,%rdi jz .Lbadpointer - .picmeup %rbp - lea AES_Te-.(%rbp),%rbp + lea .LAES_Te(%rip),%rbp lea 2048+128(%rbp),%rbp # prefetch Te4 @@ -1564,8 +1561,7 @@ AES_set_decrypt_key: cmp %rsi,%rdi jne .Linvert - .picmeup %rax - lea AES_Te+2048+1024-.(%rax),%rax # rcon + lea .LAES_Te+2048+1024(%rip),%rax # rcon mov 40(%rax),$mask80 mov 48(%rax),$maskfe @@ -1636,11 +1632,10 @@ AES_cbc_encrypt: cld mov %r9d,%r9d # clear upper half of enc - .picmeup $sbox - lea AES_Te-.($sbox),$sbox + lea .LAES_Te(%rip),$sbox cmp \$0,%r9 jne .Lcbc_picked_te - lea AES_Td-AES_Te($sbox),$sbox + lea .LAES_Td(%rip),$sbox .Lcbc_picked_te: mov OPENSSL_ia32cap_P(%rip),%eax @@ -2066,9 +2061,8 @@ ___ } $code.=<<___; -.globl AES_Te .align 64 -AES_Te: +.LAES_Te: ___ &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6); &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591); @@ -2275,9 +2269,8 @@ $code.=<<___; .long 0xfefefefe, 0xfefefefe, 0x1b1b1b1b, 0x1b1b1b1b ___ $code.=<<___; -.globl AES_Td .align 64 -AES_Td: +.LAES_Td: ___ &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a); &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b); diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c index f13f52dd8..c4d941d0b 100644 --- a/crypto/bn/asm/x86_64-gcc.c +++ b/crypto/bn/asm/x86_64-gcc.c @@ -182,7 +182,7 @@ BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) asm ( " subq %2,%2 \n" - ".align 16 \n" + ".p2align 4 \n" "1: movq (%4,%2,8),%0 \n" " adcq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" @@ -205,7 +205,7 @@ BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n) asm ( " subq %2,%2 \n" - ".align 16 \n" + ".p2align 4 \n" "1: movq (%4,%2,8),%0 \n" " sbbq (%5,%2,8),%0 \n" " movq %0,(%3,%2,8) \n" diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl index c2af3109a..959a67a86 100755 --- a/crypto/rc4/asm/rc4-x86_64.pl +++ b/crypto/rc4/asm/rc4-x86_64.pl @@ -336,8 +336,7 @@ RC4_set_key: .type RC4_options,\@function,0 .align 16 RC4_options: - .picmeup %rax - lea .Lopts-.(%rax),%rax + lea .Lopts(%rip),%rax mov OPENSSL_ia32cap_P(%rip),%edx bt \$20,%edx jnc .Ldone diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl index b6252d31e..10fd2abb6 100755 --- a/crypto/sha/asm/sha512-x86_64.pl +++ b/crypto/sha/asm/sha512-x86_64.pl @@ -40,14 +40,16 @@ # sha256_block:-( This is presumably because 64-bit shifts/rotates # apparently are not atomic instructions, but implemented in microcode. -$output=shift; +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -open STDOUT,"| $^X $xlate $output"; +open STDOUT,"| $^X $xlate $flavour $output"; if ($output =~ /512/) { $func="sha512_block_data_order"; @@ -196,8 +198,7 @@ $func: mov %rdx,$_end # save end pointer, "3rd" arg mov %rbp,$_rsp # save copy of %rsp - .picmeup $Tbl - lea $TABLE-.($Tbl),$Tbl + lea $TABLE(%rip),$Tbl mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B diff --git a/crypto/whrlpool/asm/wp-x86_64.pl b/crypto/whrlpool/asm/wp-x86_64.pl index 41bf3b202..aaed35341 100644 --- a/crypto/whrlpool/asm/wp-x86_64.pl +++ b/crypto/whrlpool/asm/wp-x86_64.pl @@ -71,8 +71,7 @@ $func: mov %rdx,16(%rbx) mov %rax,32(%rbx) # saved stack pointer - .picmeup %rbp - lea $table-.(%rbp),%rbp + lea $table(%rip),%rbp xor %rcx,%rcx xor %rdx,%rdx diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl index e19ecdbbf..c54b9e368 100644 --- a/crypto/x86_64cpuid.pl +++ b/crypto/x86_64cpuid.pl @@ -1,110 +1,37 @@ #!/usr/bin/env perl -$output=shift; -$masm=1 if ($output =~ /\.asm/); -open STDOUT,">$output" || die "can't open $output: $!"; +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } -print<<___ if(defined($masm)); -_TEXT SEGMENT -PUBLIC OPENSSL_rdtsc +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); -PUBLIC OPENSSL_atomic_add -ALIGN 16 -OPENSSL_atomic_add PROC - mov eax,DWORD PTR[rcx] -\$Lspin: lea r8,DWORD PTR[rdx+rax] -lock cmpxchg DWORD PTR[rcx],r8d - jne \$Lspin - mov eax,r8d - cdqe - ret -OPENSSL_atomic_add ENDP - -PUBLIC OPENSSL_wipe_cpu -ALIGN 16 -OPENSSL_wipe_cpu PROC - pxor xmm0,xmm0 - pxor xmm1,xmm1 - pxor xmm2,xmm2 - pxor xmm3,xmm3 - pxor xmm4,xmm4 - pxor xmm5,xmm5 - xor rcx,rcx - xor rdx,rdx - xor r8,r8 - xor r9,r9 - xor r10,r10 - xor r11,r11 - lea rax,QWORD PTR[rsp+8] - ret -OPENSSL_wipe_cpu ENDP -_TEXT ENDS - -CRT\$XIU SEGMENT -EXTRN OPENSSL_cpuid_setup:PROC -DQ OPENSSL_cpuid_setup -CRT\$XIU ENDS - -___ -print<<___ if(!defined($masm)); -.text - -.globl OPENSSL_atomic_add -.type OPENSSL_atomic_add,\@function -.align 16 -OPENSSL_atomic_add: - movl (%rdi),%eax -.Lspin: leaq (%rsi,%rax),%r8 -lock; cmpxchgl %r8d,(%rdi) - jne .Lspin - movl %r8d,%eax - .byte 0x48,0x98 - ret -.size OPENSSL_atomic_add,.-OPENSSL_atomic_add - -.globl OPENSSL_wipe_cpu -.type OPENSSL_wipe_cpu,\@function -.align 16 -OPENSSL_wipe_cpu: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - pxor %xmm10,%xmm10 - pxor %xmm11,%xmm11 - pxor %xmm12,%xmm12 - pxor %xmm13,%xmm13 - pxor %xmm14,%xmm14 - pxor %xmm15,%xmm15 - xorq %rcx,%rcx - xorq %rdx,%rdx - xorq %rsi,%rsi - xorq %rdi,%rdi - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - xorq %r11,%r11 - leaq 8(%rsp),%rax - ret -.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $flavour $output"; +if ($win64) { $arg1="%rcx"; $arg2="%rdx"; } +else { $arg1="%rdi"; $arg2="%rsi"; } +print<<___; +.extern OPENSSL_cpuid_setup .section .init call OPENSSL_cpuid_setup -___ - -$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; -open STDOUT,"| $^X ${dir}perlasm/x86_64-xlate.pl $output"; - -print<<___; .text +.globl OPENSSL_atomic_add +.type OPENSSL_atomic_add,\@abi-omnipotent +.align 16 +OPENSSL_atomic_add: + movl ($arg1),%eax +.Lspin: leaq ($arg2,%rax),%r8 + .byte 0xf0 # lock + cmpxchgl %r8d,($arg1) + jne .Lspin + movl %r8d,%eax + .byte 0x48,0x98 # cltq/cdqe + ret +.size OPENSSL_atomic_add,.-OPENSSL_atomic_add + .globl OPENSSL_rdtsc .type OPENSSL_rdtsc,\@abi-omnipotent .align 16 @@ -159,35 +86,91 @@ OPENSSL_ia32_cpuid: .size OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid .globl OPENSSL_cleanse -.type OPENSSL_cleanse,\@function,2 +.type OPENSSL_cleanse,\@abi-omnipotent .align 16 OPENSSL_cleanse: xor %rax,%rax - cmp \$15,%rsi + cmp \$15,$arg2 jae .Lot .Little: - mov %al,(%rdi) - sub \$1,%rsi - lea 1(%rdi),%rdi + mov %al,($arg1) + sub \$1,$arg2 + lea 1($arg1),$arg1 jnz .Little ret .align 16 .Lot: - test \$7,%rdi + test \$7,$arg1 jz .Laligned - mov %al,(%rdi) - lea -1(%rsi),%rsi - lea 1(%rdi),%rdi + mov %al,($arg1) + lea -1($arg2),$arg2 + lea 1($arg1),$arg1 jmp .Lot .Laligned: - mov %rax,(%rdi) - lea -8(%rsi),%rsi - test \$-8,%rsi - lea 8(%rdi),%rdi + mov %rax,($arg1) + lea -8($arg2),$arg2 + test \$-8,$arg2 + lea 8($arg1),$arg1 jnz .Laligned - cmp \$0,%rsi + cmp \$0,$arg2 jne .Little ret .size OPENSSL_cleanse,.-OPENSSL_cleanse ___ + +print<<___ if (!$win64); +.globl OPENSSL_wipe_cpu +.type OPENSSL_wipe_cpu,\@abi-omnipotent +.align 16 +OPENSSL_wipe_cpu: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + xorq %rcx,%rcx + xorq %rdx,%rdx + xorq %rsi,%rsi + xorq %rdi,%rdi + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + leaq 8(%rsp),%rax + ret +.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu +___ +print<<___ if ($win64); +.globl OPENSSL_wipe_cpu +.type OPENSSL_wipe_cpu,\@abi-omnipotent +.align 16 +OPENSSL_wipe_cpu: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorq %rcx,%rcx + xorq %rdx,%rdx + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + leaq 8(%rsp),%rax + ret +.size OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu +___ + close STDOUT; # flush