crypto/bn/rsaz*: fix licensing note.

rsaz_exp.c: harmonize line terminators;
asm/rsaz-*.pl: minor optimizations.
Andy Polyakov 2013-12-03 22:05:17 +01:00
parent 6efef384c6
commit 31ed9a2131
3 changed files with 500 additions and 459 deletions

crypto/bn/asm/rsaz-avx2.pl

@@ -1,54 +1,66 @@
 #!/usr/bin/env perl
-#******************************************************************************
-#* Copyright(c) 2012, Intel Corp.
-#* Developers and authors:
-#* Shay Gueron (1, 2), and Vlad Krasnov (1)
-#* (1) Intel Corporation, Israel Development Center, Haifa, Israel
-#* (2) University of Haifa, Israel
-#******************************************************************************
-#* LICENSE:
-#* This submission to OpenSSL is to be made available under the OpenSSL
-#* license, and only to the OpenSSL project, in order to allow integration
-#* into the publicly distributed code.
-#* The use of this code, or portions of this code, or concepts embedded in
-#* this code, or modification of this code and/or algorithm(s) in it, or the
-#* use of this code for any other purpose than stated above, requires special
-#* licensing.
-#******************************************************************************
-#* DISCLAIMER:
-#* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS
-#* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-#* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-#* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
-#* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-#* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-#* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-#* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-#* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-#* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-#* POSSIBILITY OF SUCH DAMAGE.
-#******************************************************************************
-#* Reference:
-#* [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
-#*     Exponentiation, Using Advanced Vector Instructions Architectures",
-#*     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
-#*     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
-#* [2] S. Gueron: "Efficient Software Implementations of Modular
-#*     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
-#* [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE
-#*     Proceedings of 9th International Conference on Information Technology:
-#*     New Generations (ITNG 2012), pp.821-823 (2012)
-#* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
-#*     resistant 1024-bit modular exponentiation, for optimizing RSA2048
-#*     on AVX2 capable x86_64 platforms",
-#*     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
-#******************************************************************************
+##############################################################################
+#                                                                            #
+# Copyright (c) 2012, Intel Corporation                                      #
+#                                                                            #
+# All rights reserved.                                                       #
+#                                                                            #
+# Redistribution and use in source and binary forms, with or without        #
+# modification, are permitted provided that the following conditions are    #
+# met:                                                                       #
+#                                                                            #
+# *  Redistributions of source code must retain the above copyright         #
+#    notice, this list of conditions and the following disclaimer.          #
+#                                                                            #
+# *  Redistributions in binary form must reproduce the above copyright      #
+#    notice, this list of conditions and the following disclaimer in the    #
+#    documentation and/or other materials provided with the                 #
+#    distribution.                                                           #
+#                                                                            #
+# *  Neither the name of the Intel Corporation nor the names of its         #
+#    contributors may be used to endorse or promote products derived from   #
+#    this software without specific prior written permission.               #
+#                                                                            #
+#                                                                            #
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
+#                                                                            #
+##############################################################################
+# Developers and authors:                                                    #
+# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
+# (2) University of Haifa, Israel                                            #
+##############################################################################
+# Reference:                                                                 #
+# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
+#     Exponentiation, Using Advanced Vector Instructions Architectures",     #
+#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
+#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
+# [2] S. Gueron: "Efficient Software Implementations of Modular              #
+#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
+# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE         #
+#     Proceedings of 9th International Conference on Information Technology: #
+#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
+#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
+#     on AVX2 capable x86_64 platforms",                                     #
+#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
+##############################################################################
 #
-# +10% improvement by <appro@openssl.org>
+# +13% improvement over original submission by <appro@openssl.org>
 #
 # rsa2048 sign/sec    OpenSSL 1.0.1    scalar(*)    this
-# 2.3GHz Haswell      621              732/+18%     1112/+79%
+# 2.3GHz Haswell      621              765/+23%     1113/+79%
 #
 # (*) if system doesn't support AVX2, for reference purposes;
@@ -143,24 +155,24 @@ rsaz_1024_sqr_avx2:    # 702 cycles, 14% faster than rsaz_1024_mul_avx2
     push %r13
     push %r14
     push %r15
+    vzeroupper
 ___
 $code.=<<___ if ($win64);
     lea -0xa8(%rsp),%rsp
-    movaps %xmm6,-0xd8(%rax)
-    movaps %xmm7,-0xc8(%rax)
-    movaps %xmm8,-0xb8(%rax)
-    movaps %xmm9,-0xa8(%rax)
-    movaps %xmm10,-0x98(%rax)
-    movaps %xmm11,-0x88(%rax)
-    movaps %xmm12,-0x78(%rax)
-    movaps %xmm13,-0x68(%rax)
-    movaps %xmm14,-0x58(%rax)
-    movaps %xmm15,-0x48(%rax)
+    vmovaps %xmm6,-0xd8(%rax)
+    vmovaps %xmm7,-0xc8(%rax)
+    vmovaps %xmm8,-0xb8(%rax)
+    vmovaps %xmm9,-0xa8(%rax)
+    vmovaps %xmm10,-0x98(%rax)
+    vmovaps %xmm11,-0x88(%rax)
+    vmovaps %xmm12,-0x78(%rax)
+    vmovaps %xmm13,-0x68(%rax)
+    vmovaps %xmm14,-0x58(%rax)
+    vmovaps %xmm15,-0x48(%rax)
 .Lsqr_1024_body:
 ___
 $code.=<<___;
     mov %rax,%rbp
-    vzeroall
     mov %rdx, $np            # reassigned argument
     sub \$$FrameSize, %rsp
     mov $np, $tmp
@@ -171,6 +183,7 @@ $code.=<<___;
     and \$4095, $tmp        # see if $np crosses page
     add \$32*10, $tmp
     shr \$12, $tmp
+    vpxor $ACC9,$ACC9,$ACC9
     jz  .Lsqr_1024_no_n_copy

     # unaligned 256-bit load that crosses page boundary can
@@ -198,7 +211,7 @@ $code.=<<___;
     vmovdqu $ACC6, 32*6-128($np)
     vmovdqu $ACC7, 32*7-128($np)
     vmovdqu $ACC8, 32*8-128($np)
-    vmovdqu $ACC9, 32*9-128($np)    # $ACC9 is zero after vzeroall
+    vmovdqu $ACC9, 32*9-128($np)    # $ACC9 is zero

 .Lsqr_1024_no_n_copy:
     and \$-1024, %rsp
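
The three-instruction test in the hunks above (and \$4095; add \$32*10; shr \$12) is nonzero exactly when a 320-byte operand straddles a 4KB page, in which case the code copies the operand aside so that none of the subsequent 256-bit loads crosses a page boundary. A minimal C restatement of that test, for orientation only (crosses_page is our name, not something from the patch):

#include <stddef.h>
#include <stdint.h>

/* Nonzero iff the len-byte object at ptr spans a 4096-byte page boundary.
 * Mirrors the assembly's ((addr & 4095) + 32*10) >> 12 check, len = 320. */
static int crosses_page(const void *ptr, size_t len)
{
    return (int)((((uintptr_t)ptr & 4095) + len) >> 12);
}
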
@@ -876,17 +889,18 @@ rsaz_1024_mul_avx2:
     push %r15
 ___
 $code.=<<___ if ($win64);
+    vzeroupper
     lea -0xa8(%rsp),%rsp
-    movaps %xmm6,-0xd8(%rax)
-    movaps %xmm7,-0xc8(%rax)
-    movaps %xmm8,-0xb8(%rax)
-    movaps %xmm9,-0xa8(%rax)
-    movaps %xmm10,-0x98(%rax)
-    movaps %xmm11,-0x88(%rax)
-    movaps %xmm12,-0x78(%rax)
-    movaps %xmm13,-0x68(%rax)
-    movaps %xmm14,-0x58(%rax)
-    movaps %xmm15,-0x48(%rax)
+    vmovaps %xmm6,-0xd8(%rax)
+    vmovaps %xmm7,-0xc8(%rax)
+    vmovaps %xmm8,-0xb8(%rax)
+    vmovaps %xmm9,-0xa8(%rax)
+    vmovaps %xmm10,-0x98(%rax)
+    vmovaps %xmm11,-0x88(%rax)
+    vmovaps %xmm12,-0x78(%rax)
+    vmovaps %xmm13,-0x68(%rax)
+    vmovaps %xmm14,-0x58(%rax)
+    vmovaps %xmm15,-0x48(%rax)
 .Lmul_1024_body:
 ___
 $code.=<<___;
@@ -900,6 +914,7 @@ $code.=<<___;
     # cross page boundary, swap it with $bp [meaning that caller
     # is advised to lay down $ap and $bp next to each other, so
     # that only one can cross page boundary].
+    .byte 0x67,0x67
     mov $ap, $tmp
     and \$4095, $tmp
     add \$32*10, $tmp
@@ -915,6 +930,7 @@ $code.=<<___;
     and \$4095, $tmp        # see if $np crosses page
     add \$32*10, $tmp
+    .byte 0x67,0x67
     shr \$12, $tmp
     jz  .Lmul_1024_no_n_copy
@@ -960,6 +976,7 @@ $code.=<<___;
     vpbroadcastq ($bp), $Bi
     vmovdqu $ACC0, (%rsp)        # clear top of stack
     xor $r0, $r0
+    .byte 0x67
     xor $r1, $r1
     xor $r2, $r2
     xor $r3, $r3
@@ -1564,22 +1581,22 @@ rsaz_1024_gather5_avx2:
 ___
 $code.=<<___ if ($win64);
     lea -0x88(%rsp),%rax
+    vzeroupper
 .LSEH_begin_rsaz_1024_gather5:
     # I can't trust assembler to use specific encoding:-(
     .byte 0x48,0x8d,0x60,0xe0           #lea    -0x20(%rax),%rsp
-    .byte 0x0f,0x29,0x70,0xe0           #movaps %xmm6,-0x20(%rax)
-    .byte 0x0f,0x29,0x78,0xf0           #movaps %xmm7,-0x10(%rax)
-    .byte 0x44,0x0f,0x29,0x00           #movaps %xmm8,0(%rax)
-    .byte 0x44,0x0f,0x29,0x48,0x10      #movaps %xmm9,0x10(%rax)
-    .byte 0x44,0x0f,0x29,0x50,0x20      #movaps %xmm10,0x20(%rax)
-    .byte 0x44,0x0f,0x29,0x58,0x30      #movaps %xmm11,0x30(%rax)
-    .byte 0x44,0x0f,0x29,0x60,0x40      #movaps %xmm12,0x40(%rax)
-    .byte 0x44,0x0f,0x29,0x68,0x50      #movaps %xmm13,0x50(%rax)
-    .byte 0x44,0x0f,0x29,0x70,0x60      #movaps %xmm14,0x60(%rax)
-    .byte 0x44,0x0f,0x29,0x78,0x70      #movaps %xmm15,0x70(%rax)
+    .byte 0xc5,0xf8,0x29,0x70,0xe0      #vmovaps %xmm6,-0x20(%rax)
+    .byte 0xc5,0xf8,0x29,0x78,0xf0      #vmovaps %xmm7,-0x10(%rax)
+    .byte 0xc5,0x78,0x29,0x40,0x00      #vmovaps %xmm8,0(%rax)
+    .byte 0xc5,0x78,0x29,0x48,0x10      #vmovaps %xmm9,0x10(%rax)
+    .byte 0xc5,0x78,0x29,0x50,0x20      #vmovaps %xmm10,0x20(%rax)
+    .byte 0xc5,0x78,0x29,0x58,0x30      #vmovaps %xmm11,0x30(%rax)
+    .byte 0xc5,0x78,0x29,0x60,0x40      #vmovaps %xmm12,0x40(%rax)
+    .byte 0xc5,0x78,0x29,0x68,0x50      #vmovaps %xmm13,0x50(%rax)
+    .byte 0xc5,0x78,0x29,0x70,0x60      #vmovaps %xmm14,0x60(%rax)
+    .byte 0xc5,0x78,0x29,0x78,0x70      #vmovaps %xmm15,0x70(%rax)
 ___
 $code.=<<___;
+    vzeroupper
     lea .Lgather_table(%rip),%r11
     mov $power,%eax
     and \$3,$power
@@ -1596,25 +1613,25 @@ $code.=<<___;
     vpbroadcastb 2(%r11,%rax), %xmm14
     vpbroadcastb 1(%r11,%rax), %xmm15
-    lea ($inp,$power),$inp
+    lea 64($inp,$power),$inp
     mov \$64,%r11            # size optimization
     mov \$9,%eax
     jmp .Loop_gather_1024

 .align 32
 .Loop_gather_1024:
-    vpand ($inp), %xmm8,%xmm0
-    vpand ($inp,%r11), %xmm9,%xmm1
-    vpand ($inp,%r11,2), %xmm10,%xmm2
-    vpand 64($inp,%r11,2), %xmm11,%xmm3
+    vpand -64($inp), %xmm8,%xmm0
+    vpand ($inp), %xmm9,%xmm1
+    vpand 64($inp), %xmm10,%xmm2
+    vpand ($inp,%r11,2), %xmm11,%xmm3
     vpor  %xmm0,%xmm1,%xmm1
-    vpand ($inp,%r11,4), %xmm12,%xmm4
+    vpand 64($inp,%r11,2), %xmm12,%xmm4
     vpor  %xmm2,%xmm3,%xmm3
-    vpand 64($inp,%r11,4), %xmm13,%xmm5
+    vpand ($inp,%r11,4), %xmm13,%xmm5
     vpor  %xmm1,%xmm3,%xmm3
-    vpand -128($inp,%r11,8), %xmm14,%xmm6
+    vpand 64($inp,%r11,4), %xmm14,%xmm6
     vpor  %xmm4,%xmm5,%xmm5
-    vpand -64($inp,%r11,8), %xmm15,%xmm2
+    vpand -128($inp,%r11,8), %xmm15,%xmm2
     lea   ($inp,%r11,8),$inp
     vpor  %xmm3,%xmm5,%xmm5
     vpor  %xmm2,%xmm6,%xmm6
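
.Loop_gather_1024 above still reads every table location and selects the wanted entry with the precomputed byte masks in %xmm8-%xmm15, so the sequence of load addresses never depends on the secret $power index; the lea 64($inp,$power) change only re-centers the displacements, the addresses touched are unchanged. A scalar sketch of the same constant-time selection idea, assuming for simplicity a flat table of 32 entries of 320 bytes each (the real table layout is interleaved, and gather_const_time is our name):

#include <string.h>

static void gather_const_time(unsigned char *val,
                              const unsigned char *tbl, int power)
{
    int i, j;

    memset(val, 0, 320);
    for (i = 0; i < 32; i++) {
        /* eq is 1 only when i == power, computed without a branch */
        unsigned int eq = ((unsigned int)(i ^ power) - 1) >> 31;
        unsigned char mask = (unsigned char)(0u - eq);

        for (j = 0; j < 320; j++)
            val[j] |= tbl[i * 320 + j] & mask;
    }
}
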
@@ -1798,16 +1815,16 @@ rsaz_se_handler:
     .rva .Lmul_1024_body,.Lmul_1024_epilogue
 .LSEH_info_rsaz_1024_gather5:
     .byte 0x01,0x33,0x16,0x00
-    .byte 0x33,0xf8,0x09,0x00    #movaps 0x90(rsp),xmm15
-    .byte 0x2e,0xe8,0x08,0x00    #movaps 0x80(rsp),xmm14
-    .byte 0x29,0xd8,0x07,0x00    #movaps 0x70(rsp),xmm13
-    .byte 0x24,0xc8,0x06,0x00    #movaps 0x60(rsp),xmm12
-    .byte 0x1f,0xb8,0x05,0x00    #movaps 0x50(rsp),xmm11
-    .byte 0x1a,0xa8,0x04,0x00    #movaps 0x40(rsp),xmm10
-    .byte 0x15,0x98,0x03,0x00    #movaps 0x30(rsp),xmm9
-    .byte 0x10,0x88,0x02,0x00    #movaps 0x20(rsp),xmm8
-    .byte 0x0c,0x78,0x01,0x00    #movaps 0x10(rsp),xmm7
-    .byte 0x08,0x68,0x00,0x00    #movaps 0x00(rsp),xmm6
+    .byte 0x36,0xf8,0x09,0x00    #vmovaps 0x90(rsp),xmm15
+    .byte 0x31,0xe8,0x08,0x00    #vmovaps 0x80(rsp),xmm14
+    .byte 0x2c,0xd8,0x07,0x00    #vmovaps 0x70(rsp),xmm13
+    .byte 0x27,0xc8,0x06,0x00    #vmovaps 0x60(rsp),xmm12
+    .byte 0x22,0xb8,0x05,0x00    #vmovaps 0x50(rsp),xmm11
+    .byte 0x1d,0xa8,0x04,0x00    #vmovaps 0x40(rsp),xmm10
+    .byte 0x18,0x98,0x03,0x00    #vmovaps 0x30(rsp),xmm9
+    .byte 0x13,0x88,0x02,0x00    #vmovaps 0x20(rsp),xmm8
+    .byte 0x0e,0x78,0x01,0x00    #vmovaps 0x10(rsp),xmm7
+    .byte 0x09,0x68,0x00,0x00    #vmovaps 0x00(rsp),xmm6
     .byte 0x04,0x01,0x15,0x00    #sub rsp,0xa8
 ___
 }

crypto/bn/asm/rsaz-x86_64.pl

@@ -1,48 +1,60 @@
 #!/usr/bin/env perl
-#******************************************************************************#
-#* Copyright(c) 2012, Intel Corp.                                             *#
-#* Developers and authors:                                                    *#
-#* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *#
-#* (1) Intel Architecture Group, Microprocessor and Chipset Development,      *#
-#*     Israel Development Center, Haifa, Israel                               *#
-#* (2) University of Haifa                                                    *#
-#******************************************************************************#
-#* This submission to OpenSSL is to be made available under the OpenSSL       *#
-#* license, and only to the OpenSSL project, in order to allow integration    *#
-#* into the publicly distributed code.                                        *#
-#* The use of this code, or portions of this code, or concepts embedded in    *#
-#* this code, or modification of this code and/or algorithm(s) in it, or the  *#
-#* use of this code for any other purpose than stated above, requires special *#
-#* licensing.                                                                  *#
-#******************************************************************************#
-#******************************************************************************#
-#* DISCLAIMER:                                                                 *#
-#* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS     *#
-#* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *#
-#* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *#
-#* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT*#
-#* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, *#
-#* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF    *#
-#* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS   *#
-#* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN    *#
-#* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)    *#
-#* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE *#
-#* POSSIBILITY OF SUCH DAMAGE.                                                *#
-#******************************************************************************#
-#* Reference:                                                                  *#
-#* [1] S. Gueron, "Efficient Software Implementations of Modular              *#
-#*     Exponentiation", http://eprint.iacr.org/2011/239                       *#
-#* [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             *#
-#*     IEEE Proceedings of 9th International Conference on Information        *#
-#*     Technology: New Generations (ITNG 2012), 821-823 (2012).               *#
-#* [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation*#
-#*     Journal of Cryptographic Engineering 2:31-43 (2012).                   *#
-#* [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    *#
-#*     resistant 512-bit and 1024-bit modular exponentiation for optimizing   *#
-#*     RSA1024 and RSA2048 on x86_64 platforms",                              *#
-#*     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest*#
-################################################################################
+##############################################################################
+#                                                                            #
+# Copyright (c) 2012, Intel Corporation                                      #
+#                                                                            #
+# All rights reserved.                                                       #
+#                                                                            #
+# Redistribution and use in source and binary forms, with or without        #
+# modification, are permitted provided that the following conditions are    #
+# met:                                                                       #
+#                                                                            #
+# *  Redistributions of source code must retain the above copyright         #
+#    notice, this list of conditions and the following disclaimer.          #
+#                                                                            #
+# *  Redistributions in binary form must reproduce the above copyright      #
+#    notice, this list of conditions and the following disclaimer in the    #
+#    documentation and/or other materials provided with the                 #
+#    distribution.                                                           #
+#                                                                            #
+# *  Neither the name of the Intel Corporation nor the names of its         #
+#    contributors may be used to endorse or promote products derived from   #
+#    this software without specific prior written permission.               #
+#                                                                            #
+#                                                                            #
+# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
+#                                                                            #
+##############################################################################
+# Developers and authors:                                                    #
+# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
+# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
+#     Israel Development Center, Haifa, Israel                               #
+# (2) University of Haifa                                                    #
+##############################################################################
+# Reference:                                                                 #
+# [1] S. Gueron, "Efficient Software Implementations of Modular              #
+#     Exponentiation", http://eprint.iacr.org/2011/239                       #
+# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
+#     IEEE Proceedings of 9th International Conference on Information        #
+#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
+# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
+#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
+#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
+#     RSA1024 and RSA2048 on x86_64 platforms",                              #
+#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
+##############################################################################

 # While original submission covers 512- and 1024-bit exponentiation,
 # this module is limited to 512-bit version only (and as such
@@ -1812,33 +1824,33 @@ $code.=<<___;
 .align 32
 __rsaz_512_mulx:
     mulx ($ap), %rbx, %r8    # initial %rdx preloaded by caller
-    xor  $zero, $zero        # cf=0,of=0
+    mov  \$-6, %rcx
     mulx 8($ap), %rax, %r9
     movq %rbx, 8(%rsp)
     mulx 16($ap), %rbx, %r10
-    adcx %rax, %r8
+    adc  %rax, %r8
     mulx 24($ap), %rax, %r11
-    adcx %rbx, %r9
-    .byte 0xc4,0x62,0xe3,0xf6,0xa6,0x20,0x00,0x00,0x00    # mulx 32($ap), %rbx, %r12
-    adcx %rax, %r10
+    adc  %rbx, %r9
+    mulx 32($ap), %rbx, %r12
+    adc  %rax, %r10
     mulx 40($ap), %rax, %r13
-    adcx %rbx, %r11
+    adc  %rbx, %r11
     mulx 48($ap), %rbx, %r14
-    adcx %rax, %r12
+    adc  %rax, %r12
     mulx 56($ap), %rax, %r15
     mov  8($bp), %rdx
-    adcx %rbx, %r13
-    adcx %rax, %r14
-    adcx $zero, %r15         # cf=0
-    mov  \$-6, %rcx
+    adc  %rbx, %r13
+    adc  %rax, %r14
+    adc  \$0, %r15
+    xor  $zero, $zero        # cf=0,of=0
     jmp  .Loop_mulx

 .align 32
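A note on the carry chains in the hunk above: mulx takes one factor implicitly from %rdx and writes the full 128-bit product to two destination registers without touching the flags, which is what lets a single adc (or adcx) chain run through the whole pass uninterrupted. What one mulx step computes, as a C sketch (mulx64 is our name, and it assumes a compiler with unsigned __int128, e.g. GCC or Clang):

#include <stdint.h>

/* Full 64x64 -> 128-bit product, flags untouched; e.g. in AT&T syntax
 * "mulx 8($ap), %rax, %r9" is mulx64(rdx, ap[1], &rax, &r9). */
static inline void mulx64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
    unsigned __int128 p = (unsigned __int128)a * b;

    *lo = (uint64_t)p;
    *hi = (uint64_t)(p >> 64);
}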

crypto/bn/rsaz_exp.c

@@ -1,306 +1,318 @@
-/******************************************************************************
-* Copyright(c) 2012, Intel Corp.
-* Developers and authors:
-* Shay Gueron (1, 2), and Vlad Krasnov (1)
-* (1) Intel Corporation, Israel Development Center, Haifa, Israel
-* (2) University of Haifa, Israel
-******************************************************************************
-* LICENSE:
-* This submission to OpenSSL is to be made available under the OpenSSL
-* license, and only to the OpenSSL project, in order to allow integration
-* into the publicly distributed code.
-* The use of this code, or portions of this code, or concepts embedded in
-* this code, or modification of this code and/or algorithm(s) in it, or the
-* use of this code for any other purpose than stated above, requires special
-* licensing.
-******************************************************************************
-* DISCLAIMER:
-* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS
-* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
-* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
-* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
-* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-* POSSIBILITY OF SUCH DAMAGE.
-******************************************************************************/
+/*****************************************************************************
+*                                                                            *
+* Copyright (c) 2012, Intel Corporation                                      *
+*                                                                            *
+* All rights reserved.                                                       *
+*                                                                            *
+* Redistribution and use in source and binary forms, with or without        *
+* modification, are permitted provided that the following conditions are    *
+* met:                                                                       *
+*                                                                            *
+* *  Redistributions of source code must retain the above copyright         *
+*    notice, this list of conditions and the following disclaimer.          *
+*                                                                            *
+* *  Redistributions in binary form must reproduce the above copyright      *
+*    notice, this list of conditions and the following disclaimer in the    *
+*    documentation and/or other materials provided with the                 *
+*    distribution.                                                           *
+*                                                                            *
+* *  Neither the name of the Intel Corporation nor the names of its         *
+*    contributors may be used to endorse or promote products derived from   *
+*    this software without specific prior written permission.               *
+*                                                                            *
+*                                                                            *
+* THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          *
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         *
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        *
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            *
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
+* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
+* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
+*                                                                            *
+******************************************************************************
+* Developers and authors:                                                    *
+* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
+* (2) University of Haifa, Israel                                            *
+*****************************************************************************/

 #include "rsaz_exp.h"

 /*
  * See crypto/bn/asm/rsaz-avx2.pl for further details.
  */
 void rsaz_1024_norm2red_avx2(void *red,const void *norm);
 void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,unsigned long k);
 void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,unsigned long k,int cnt);
 void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i);
 void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i);
 void rsaz_1024_red2norm_avx2(void *norm,const void *red);

 #if defined(__GNUC__)
 # define ALIGN64    __attribute__((aligned(64)))
 #elif defined(_MSC_VER)
 # define ALIGN64    __declspec(align(64))
 #elif defined(__SUNPRO_C)
 # define ALIGN64
 # pragma align 64(one,two80)
 #else
 # define ALIGN64    /* not fatal, might hurt performance a little */
 #endif

 ALIGN64 static const unsigned long one[40] =
     {1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
 ALIGN64 static const unsigned long two80[40] =
     {0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};

 void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
     const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
     const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0)
 {
     unsigned char storage[320*3+32*9*16+64];    /* 5.5KB */
     unsigned char *p_str = storage + (64-((size_t)storage%64));
     unsigned char *a_inv, *m, *result,
         *table_s = p_str+320*3,
         *R2      = table_s;    /* borrow */
     int index;
     int wvalue;

     if ((((size_t)p_str&4095)+320)>>12) {
         result = p_str;
         a_inv = p_str + 320;
         m = p_str + 320*2;    /* should not cross page */
     } else {
         m = p_str;    /* should not cross page */
         result = p_str + 320;
         a_inv = p_str + 320*2;
     }

     rsaz_1024_norm2red_avx2(m, m_norm);
     rsaz_1024_norm2red_avx2(a_inv, base_norm);
     rsaz_1024_norm2red_avx2(R2, RR);

     rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
     rsaz_1024_mul_avx2(R2, R2, two80, m, k0);

     /* table[0] = 1 */
     rsaz_1024_mul_avx2(result, R2, one, m, k0);
     /* table[1] = a_inv^1 */
     rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);

     rsaz_1024_scatter5_avx2(table_s,result,0);
     rsaz_1024_scatter5_avx2(table_s,a_inv,1);

     /* table[2] = a_inv^2 */
     rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,2);
 #if 0
     /* this is almost 2x smaller and less than 1% slower */
     for (index=3; index<32; index++) {
         rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
         rsaz_1024_scatter5_avx2(table_s,result,index);
     }
 #else
     /* table[4] = a_inv^4 */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,4);
     /* table[8] = a_inv^8 */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,8);
     /* table[16] = a_inv^16 */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,16);
     /* table[17] = a_inv^17 */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     rsaz_1024_scatter5_avx2(table_s,result,17);

     /* table[3] */
     rsaz_1024_gather5_avx2(result,table_s,2);
     rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
     rsaz_1024_scatter5_avx2(table_s,result,3);
     /* table[6] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,6);
     /* table[12] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,12);
     /* table[24] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,24);
     /* table[25] */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     rsaz_1024_scatter5_avx2(table_s,result,25);

     /* table[5] */
     rsaz_1024_gather5_avx2(result,table_s,4);
     rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
     rsaz_1024_scatter5_avx2(table_s,result,5);
     /* table[10] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,10);
     /* table[20] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,20);
     /* table[21] */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     rsaz_1024_scatter5_avx2(table_s,result,21);

     /* table[7] */
     rsaz_1024_gather5_avx2(result,table_s,6);
     rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
     rsaz_1024_scatter5_avx2(table_s,result,7);
     /* table[14] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,14);
     /* table[28] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,28);
     /* table[29] */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     rsaz_1024_scatter5_avx2(table_s,result,29);

     /* table[9] */
     rsaz_1024_gather5_avx2(result,table_s,8);
     rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
     rsaz_1024_scatter5_avx2(table_s,result,9);
     /* table[18] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,18);
     /* table[19] */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     rsaz_1024_scatter5_avx2(table_s,result,19);

     /* table[11] */
     rsaz_1024_gather5_avx2(result,table_s,10);
     rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
     rsaz_1024_scatter5_avx2(table_s,result,11);
     /* table[22] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,22);
     /* table[23] */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     rsaz_1024_scatter5_avx2(table_s,result,23);

     /* table[13] */
     rsaz_1024_gather5_avx2(result,table_s,12);
     rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
     rsaz_1024_scatter5_avx2(table_s,result,13);
     /* table[26] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,26);
     /* table[27] */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     rsaz_1024_scatter5_avx2(table_s,result,27);

     /* table[15] */
     rsaz_1024_gather5_avx2(result,table_s,14);
     rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
     rsaz_1024_scatter5_avx2(table_s,result,15);
     /* table[30] */
     rsaz_1024_sqr_avx2(result, result, m, k0, 1);
     rsaz_1024_scatter5_avx2(table_s,result,30);
     /* table[31] */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     rsaz_1024_scatter5_avx2(table_s,result,31);
 #endif

     /* load first window */
     p_str = (unsigned char*)exponent;
     wvalue = p_str[127] >> 3;
     rsaz_1024_gather5_avx2(result,table_s,wvalue);

     index = 1014;

     while(index > -1) {    /* loop for the remaining 127 windows */

         rsaz_1024_sqr_avx2(result, result, m, k0, 5);

         wvalue = *((unsigned short*)&p_str[index/8]);
         wvalue = (wvalue>> (index%8)) & 31;
         index-=5;

         rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);    /* borrow a_inv */
         rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
     }

     /* square four times */
     rsaz_1024_sqr_avx2(result, result, m, k0, 4);

     wvalue = p_str[0] & 15;

     rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);    /* borrow a_inv */
     rsaz_1024_mul_avx2(result, result, a_inv, m, k0);

     /* from Montgomery */
     rsaz_1024_mul_avx2(result, result, one, m, k0);

     rsaz_1024_red2norm_avx2(result_norm, result);

     OPENSSL_cleanse(storage,sizeof(storage));
 }

 /*
  * See crypto/bn/rsaz-x86_64.pl for further details.
  */
 void rsaz_512_mul(void *ret,const void *a,const void *b,const void *n,unsigned long k);
 void rsaz_512_mul_scatter4(void *ret,const void *a,const void *n,unsigned long k,const void *tbl,unsigned int power);
 void rsaz_512_mul_gather4(void *ret,const void *a,const void *tbl,const void *n,unsigned long k,unsigned int power);
 void rsaz_512_mul_by_one(void *ret,const void *a,const void *n,unsigned long k);
 void rsaz_512_sqr(void *ret,const void *a,const void *n,unsigned long k,int cnt);
 void rsaz_512_scatter4(void *tbl, const unsigned long *val, int power);
 void rsaz_512_gather4(unsigned long *val, const void *tbl, int power);

 void RSAZ_512_mod_exp(BN_ULONG result[8],
     const BN_ULONG base[8], const BN_ULONG exponent[8],
     const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
 {
     unsigned char storage[16*8*8+64*2+64];    /* 1.2KB */
     unsigned char *table = storage + (64-((size_t)storage%64));
     unsigned long *a_inv = (unsigned long *)(table+16*8*8),
         *temp = (unsigned long *)(table+16*8*8+8*8);
     unsigned char *p_str = (unsigned char*)exponent;
     int index;
     unsigned int wvalue;

     /* table[0] = 1_inv */
     temp[0] = 0-m[0];    temp[1] = ~m[1];
     temp[2] = ~m[2];    temp[3] = ~m[3];
     temp[4] = ~m[4];    temp[5] = ~m[5];
     temp[6] = ~m[6];    temp[7] = ~m[7];
     rsaz_512_scatter4(table, temp, 0);

     /* table [1] = a_inv^1 */
     rsaz_512_mul(a_inv, base, RR, m, k0);
     rsaz_512_scatter4(table, a_inv, 1);

     /* table [2] = a_inv^2 */
     rsaz_512_sqr(temp, a_inv, m, k0, 1);
     rsaz_512_scatter4(table, temp, 2);

     for (index=3; index<16; index++)
         rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);

     /* load first window */
     wvalue = p_str[63];

     rsaz_512_gather4(temp, table, wvalue>>4);
     rsaz_512_sqr(temp, temp, m, k0, 4);
     rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf);

     for (index=62; index>=0; index--) {
         wvalue = p_str[index];

         rsaz_512_sqr(temp, temp, m, k0, 4);
         rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4);

         rsaz_512_sqr(temp, temp, m, k0, 4);
         rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f);
     }

     /* from Montgomery */
     rsaz_512_mul_by_one(result, temp, m, k0);

     OPENSSL_cleanse(storage,sizeof(storage));
 }
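
On the window arithmetic in RSAZ_1024_mod_exp_avx2 above: the top 5-bit window is p_str[127] >> 3 (exponent bits 1019..1023), the loop then extracts 5-bit windows at bit offsets index = 1014, 1009, ..., 4, and a final 4-bit window p_str[0] & 15 consumes bits 0..3, so the whole 1024-bit exponent is covered (204 * 5 + 4 = 1024). A self-contained sketch of the extraction step, with memcpy standing in for the code's unaligned 16-bit read (window5 is our name):

#include <string.h>

/* 5-bit window starting at bit 'index' of a little-endian exponent. */
static unsigned int window5(const unsigned char *p_str, int index)
{
    unsigned short w;

    memcpy(&w, &p_str[index / 8], sizeof(w));    /* 16 bits covering the window */
    return (w >> (index % 8)) & 31;
}

RSAZ_512_mod_exp uses the same pattern with fixed 4-bit windows: each exponent byte p_str[index] yields two table lookups, wvalue>>4 and wvalue&0x0f, each preceded by four squarings.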