From a5bb5bca52f57021a4017521c55a6b3590bbba7a Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 3 Oct 2013 00:45:04 +0200 Subject: [PATCH] bn/asm/x86_64-mont*.pl: add MULX/ADCX/ADOX code path. --- crypto/bn/asm/x86_64-mont.pl | 1232 ++++++++++++++++++++++++++++++++- crypto/bn/asm/x86_64-mont5.pl | 426 ++++++++++++ 2 files changed, 1621 insertions(+), 37 deletions(-) diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index ba938d6cc..843a102c8 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -31,10 +31,14 @@ # June 2013. # -# Optmize reduction in squaring procedure and improve 1024+-bit RSA +# Optimize reduction in squaring procedure and improve 1024+-bit RSA # sign performance by 10-16% on Intel Sandy Bridge and later # (virtually same on non-Intel processors). +# August 2013. +# +# Add MULX/ADOX/ADCX code path. + $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } @@ -49,6 +53,21 @@ die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.22); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $addx = ($1>=11); +} + # int bn_mul_mont( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, @@ -67,6 +86,8 @@ $m1="%rbp"; $code=<<___; .text +.extern OPENSSL_ia32cap_P + .globl bn_mul_mont .type bn_mul_mont,\@function,6 .align 16 @@ -75,6 +96,11 @@ bn_mul_mont: jnz .Lmul_enter cmp \$8,${num}d jb .Lmul_enter +___ +$code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%r11d +___ +$code.=<<___; cmp $ap,$bp jne .Lmul4x_enter test \$7,${num}d @@ -288,6 +314,13 @@ $code.=<<___; .align 16 bn_mul4x_mont: .Lmul4x_enter: +___ +$code.=<<___ if ($addx); + and \$0x80100,%r11d + cmp \$0x80100,%r11d + je .Lmulx4x_enter +___ +$code.=<<___; push %rbx push %rbp push %r12 @@ -714,6 +747,13 @@ $code.=<<___; .align 32 bn_sqr8x_mont: .Lsqr8x_enter: +___ +$code.=<<___ if ($addx); + and \$0x80100,%r11d + cmp \$0x80100,%r11d + je .Lsqrx8x_enter +___ +$code.=<<___; push %rbx push %rbp push %r12 @@ -1122,44 +1162,45 @@ $code.=<<___; mov $carry,24($tptr) # t[7] mov -16($aptr,$i),%rax # a[0] - lea 64(%rsp,$num,2),$tptr + lea 64(%rsp),$tptr xor $A0[0],$A0[0] # t[0] - mov -24($tptr,$i,2),$A0[1] # t[1] + mov 8($tptr),$A0[1] # t[1] lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 - mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov 16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf - mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr,$i),%rax # a[i+1] # prefetch - mov $S[0],-32($tptr,$i,2) + mov $S[0],($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift - mov $S[1],-24($tptr,$i,2) + mov $S[1],8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 - mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov 32($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf - mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 0($aptr,$i),%rax # a[i+1] # prefetch - mov $S[2],-16($tptr,$i,2) + mov $S[2],16($tptr) adc %rdx,$S[3] lea 16($i),$i - mov $S[3],-40($tptr,$i,2) + mov $S[3],24($tptr) sbb $carry,$carry # mov cf,$carry + lea 64($tptr),$tptr jmp .Lsqr4x_shift_n_add .align 32 @@ -1169,68 +1210,69 @@ $code.=<<___; lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 - mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov -16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf - mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr,$i),%rax # a[i+1] # prefetch - mov $S[0],-32($tptr,$i,2) + mov $S[0],-32($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift - mov $S[1],-24($tptr,$i,2) + mov $S[1],-24($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 - mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov 0($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf - mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 0($aptr,$i),%rax # a[i+1] # prefetch - mov $S[2],-16($tptr,$i,2) + mov $S[2],-16($tptr) adc %rdx,$S[3] lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift - mov $S[3],-8($tptr,$i,2) + mov $S[3],-8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 - mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov 16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf - mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov 8($aptr,$i),%rax # a[i+1] # prefetch - mov $S[0],0($tptr,$i,2) + mov $S[0],0($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift - mov $S[1],8($tptr,$i,2) + mov $S[1],8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 - mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch + mov 32($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf - mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch + mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 16($aptr,$i),%rax # a[i+1] # prefetch - mov $S[2],16($tptr,$i,2) + mov $S[2],16($tptr) adc %rdx,$S[3] - mov $S[3],24($tptr,$i,2) + mov $S[3],24($tptr) sbb $carry,$carry # mov cf,$carry + lea 64($tptr),$tptr add \$32,$i jnz .Lsqr4x_shift_n_add @@ -1548,14 +1590,15 @@ $code.=<<___; jmp .Lsqr4x_sub .align 32 .Lsqr4x_sub: - mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] - mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[0],0($rptr) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rptr) # rp[i]=tp[i]-np[i] sbb 16($nptr,$i,8),@ri[2] mov 32($tptr,$i,8),@ri[0] # tp[i+1] mov 40($tptr,$i,8),@ri[1] sbb 24($nptr,$i,8),@ri[3] - mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] - mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[2],16($rptr) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rptr) # rp[i]=tp[i]-np[i] + lea 32($rptr),$rptr sbb 32($nptr,$i,8),@ri[0] mov 48($tptr,$i,8),@ri[2] mov 56($tptr,$i,8),@ri[3] @@ -1564,15 +1607,16 @@ $code.=<<___; dec $j # doesn't affect CF! jnz .Lsqr4x_sub - mov @ri[0],0($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[0],0($rptr) # rp[i]=tp[i]-np[i] mov 32($tptr,$i,8),@ri[0] # load overflow bit sbb 16($nptr,$i,8),@ri[2] - mov @ri[1],8($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[1],8($rptr) # rp[i]=tp[i]-np[i] sbb 24($nptr,$i,8),@ri[3] - mov @ri[2],16($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[2],16($rptr) # rp[i]=tp[i]-np[i] sbb \$0,@ri[0] # handle upmost overflow bit - mov @ri[3],24($rptr,$i,8) # rp[i]=tp[i]-np[i] + mov @ri[3],24($rptr) # rp[i]=tp[i]-np[i] + mov 32(%rsp),$rptr # restore $rptr xor $i,$i # i=0 and @ri[0],$tptr not @ri[0] @@ -1624,6 +1668,1094 @@ $code.=<<___; .size bn_sqr8x_mont,.-bn_sqr8x_mont ___ }}} + +if ($addx) {{{ +my $bp="%rdx"; # original value + +$code.=<<___; +.type bn_mulx4x_mont,\@function,6 +.align 32 +bn_mulx4x_mont: +.Lmulx4x_enter: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + shl \$3,${num}d # convert $num to bytes + xor %r10,%r10 + mov %rsp,%r11 # put aside %rsp + sub $num,%r10 # -$num + mov ($n0),$n0 # *n0 + lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8) + lea ($bp,$num),%r10 + and \$-128,%rsp + ############################################################## + # Stack layout + # +0 num + # +8 off-loaded &b[i] + # +16 end of b[num] + # +24 saved n0 + # +32 saved rp + # +40 + # +48 inner counter + # +56 saved %rsp + # +64 tmp[num+1] + # + mov $num,0(%rsp) # save $num + shr \$5,$num + mov %r10,16(%rsp) # end of b[num] + sub \$1,$num + mov $n0, 24(%rsp) # save *n0 + mov $rp, 32(%rsp) # save $rp + mov $num,48(%rsp) # inner counter + mov %r11,56(%rsp) # save original %rsp + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: +___ +my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= + ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); +my $rptr=$bptr; +$code.=<<___; + lea 8($bp),$bptr + mov ($bp),%rdx # b[0], $bp==%rdx actually + lea 64+32(%rsp),$tptr + mov %rdx,$bi + xor $zero,$zero # of=0,cf=0 + + mulx 0*8($aptr),$mi,%rax # a[0]*b[0] + mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] + adcx %rax,%r11 + mov $bptr,8(%rsp) # off-load &b[i] + mulx 2*8($aptr),%r12,%r13 # ... + adcx %r14,%r12 + adcx $zero,%r13 + + mov $mi,$bptr # borrow $bptr + imulq 24(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + lea 4*8($aptr),$aptr + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,$bptr # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + mov 48(%rsp),$bptr # counter value + mov %r10,-4*8($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + .byte 0x66,0x66 + mov $bi,%rdx + mov %r11,-3*8($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + lea 4*8($nptr),$nptr + mov %r12,-2*8($tptr) + + #jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcx $zero,%r15 # cf=0, modulo-scheduled + mulx 0*8($aptr),%r10,%rax # a[4]*b[0] + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + .byte 0x66,0x66 + mov $mi,%rdx + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + adcx %rax,%r12 + mov %r11,-4*8($tptr) + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_1st + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + adc $zero,%r15 # modulo-scheduled + add %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + mov ($bptr),%rdx # b[i] + lea 8($bptr),$bptr + sub $num,$aptr # rewind $aptr + mov %r15,($tptr) # save top-most carry + mov 64(%rsp),%r10 + lea 64(%rsp),$tptr + sub $num,$nptr # rewind $nptr + xor $zero,$zero # cf=0, of=0 + mov %rdx,$bi + + mulx 0*8($aptr),$mi,%rax # a[0]*b[i] + adox %r10,$mi + mov 1*8($tptr),%r10 + mulx 1*8($aptr),%r11,%r14 # a[1]*b[i] + adcx %rax,%r11 + mov $bptr,8(%rsp) # off-load &b[i] + mulx 2*8($aptr),%r12,%r13 # ... + adox %r10,%r11 + adcx %r14,%r12 + adox $zero,%r12 + .byte 0x66,0x66 + adcx $zero,%r13 + mov 2*8($tptr),%r10 + + mov $mi,$bptr # borrow $bptr + imulq 24(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + adox %r10,%r12 + adcx %rax,%r13 + adox 3*8($tptr),%r13 + adcx $zero,%r14 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + adox $zero,%r14 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,$bptr # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + mov %r10,-4*8($tptr) + mov 0*8($tptr),%r10 + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + mov 48(%rsp),$bptr # counter value + .byte 0x66,0x3e + mov %r12,-2*8($tptr) + lea 4*8($nptr),$nptr + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + adcx $zero,%r15 # cf=0, modulo-scheduled + adox %r10,%r14 + mulx 0*8($aptr),%r10,%rax # a[4]*b[i] + mov 1*8($tptr),%r13 + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] + adox %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx %r13,%r11 + adox %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + mov $mi,%rdx + adcx 2*8($tptr),%r12 + adox %rax,%r13 + adcx 3*8($tptr),%r13 + adox $zero,%r14 # of=0 + .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr + .byte 0x48,0x8d,0x9b,0x20,0x00,0x00,0x00 # lea 4*8($tptr),$tptr + adcx $zero,%r14 # cf=0 + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + mov 0*8($tptr),%r10 + adcx %rax,%r12 + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-4*8($tptr) + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_inner + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + adc $zero,%r15 # modulo-scheduled + sub %r10,$zero # pull top-most carry + adc %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + + cmp 16(%rsp),$bptr + jne .Lmulx4x_outer + + neg $num + mov 32(%rsp),$rptr # restore rp + lea 64(%rsp),$tptr + + xor %rdx,%rdx + pxor %xmm0,%xmm0 + mov 0*8($nptr,$num),%r8 + mov 1*8($nptr,$num),%r9 + neg %r8 + jmp .Lmulx4x_sub_entry + +.align 32 +.Lmulx4x_sub: + mov 0*8($nptr,$num),%r8 + mov 1*8($nptr,$num),%r9 + not %r8 +.Lmulx4x_sub_entry: + mov 2*8($nptr,$num),%r10 + not %r9 + and %r15,%r8 + mov 3*8($nptr,$num),%r11 + not %r10 + and %r15,%r9 + not %r11 + and %r15,%r10 + and %r15,%r11 + + neg %rdx # mov %rdx,%cf + adc 0*8($tptr),%r8 + adc 1*8($tptr),%r9 + movdqa %xmm0,($tptr) + adc 2*8($tptr),%r10 + adc 3*8($tptr),%r11 + movdqa %xmm0,16($tptr) + lea 4*8($tptr),$tptr + sbb %rdx,%rdx # mov %cf,%rdx + + mov %r8,0*8($rptr) + mov %r9,1*8($rptr) + mov %r10,2*8($rptr) + mov %r11,3*8($rptr) + lea 4*8($rptr),$rptr + + add \$32,$num + jnz .Lmulx4x_sub + + mov 56(%rsp),%rsi # restore %rsp + mov \$1,%rax + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lmulx4x_epilogue: + ret +.size bn_mulx4x_mont,.-bn_mulx4x_mont +___ +} { +###################################################################### +# void bn_sqr8x_mont( +my $rptr="%rdi"; # const BN_ULONG *rptr, +my $aptr="%rsi"; # const BN_ULONG *aptr, +my $bptr="%rdx"; # not used +my $nptr="%rcx"; # const BN_ULONG *nptr, +my $n0 ="%r8"; # const BN_ULONG *n0); +my $num ="%r9"; # int num, has to be divisible by 8 + +my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); +my @A0=("%r10","%r11"); +my @A1=("%r12","%r13"); +my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); + +$code.=<<___; +.type bn_sqrx8x_mont,\@function,6 +.align 32 +bn_sqrx8x_mont: +.Lsqrx8x_enter: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + shl \$3,${num}d # convert $num to bytes + xor %r10,%r10 + mov %rsp,%r11 # put aside %rsp + sub $num,%r10 # -$num + mov ($n0),$n0 # *n0 + lea -64(%rsp,%r10,2),%rsp # alloca(frame+2*$num) + and \$-1024,%rsp # minimize TLB usage + ############################################################## + # Stack layout + # + # +0 saved $num, used in reduction section + # +8 &t[2*$num], used in reduction section + # +16 intermediate carry bit + # +24 top-most carry bit, used in reduction section + # +32 saved *n0 + # +48 t[2*$num] + # + movq $rptr,%xmm1 # save $rptr + movq $nptr,%xmm2 # save $nptr + movq %r10, %xmm3 # -$num + movq %r11, %xmm4 # save original %rsp + mov $n0, 32(%rsp) +.Lsqrx8x_body: + ################################################################## + # Squaring part: + # + # a) multiply-n-add everything but a[i]*a[i]; + # b) shift result of a) by 1 to the left and accumulate + # a[i]*a[i] products; + # + ################################################################## + # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] + # a[1]a[0] + # a[2]a[0] + # a[3]a[0] + # a[2]a[1] + # a[3]a[1] + # a[3]a[2] + # + # a[4]a[0] + # a[5]a[0] + # a[6]a[0] + # a[7]a[0] + # a[4]a[1] + # a[5]a[1] + # a[6]a[1] + # a[7]a[1] + # a[4]a[2] + # a[5]a[2] + # a[6]a[2] + # a[7]a[2] + # a[4]a[3] + # a[5]a[3] + # a[6]a[3] + # a[7]a[3] + # + # a[5]a[4] + # a[6]a[4] + # a[7]a[4] + # a[6]a[5] + # a[7]a[5] + # a[7]a[6] + # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] +___ +{ +my ($zero,$carry)=("%rbp","%rcx"); +my $aaptr=$zero; +$code.=<<___; + pxor %xmm0,%xmm0 + lea 48(%rsp),$tptr + lea ($aptr,$num),$aaptr + mov $num,(%rsp) # save $num + mov $aaptr,8(%rsp) # save end of $aptr + jmp .Lsqr8x_zero_start + +.Lsqrx8x_zero: + movdqa %xmm0,0*8($tptr) + movdqa %xmm0,2*8($tptr) + movdqa %xmm0,4*8($tptr) + movdqa %xmm0,6*8($tptr) +.Lsqr8x_zero_start: + movdqa %xmm0,8*8($tptr) + movdqa %xmm0,10*8($tptr) + movdqa %xmm0,12*8($tptr) + movdqa %xmm0,14*8($tptr) + lea 16*8($tptr),$tptr + sub \$64,$num + jnz .Lsqrx8x_zero + + mov 0*8($aptr),%rdx # a[0], modulo-scheduled + xor %r8,%r8 + xor %r9,%r9 + xor %r10,%r10 + xor %r11,%r11 + xor %r12,%r12 + xor %r13,%r13 + xor %r14,%r14 + lea 48(%rsp),$tptr + xor $zero,$zero # cf=0, cf=0 + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulx 1*8($aptr),%rax,%rbx # a[1]*a[0] + adcx %rax,%r8 # a[1]*a[0]+=t[1] + adox %rbx,%r9 + mulx 2*8($aptr),%rax,%rbx # a[2]*a[0] + adcx %rax,%r9 + adox %rbx,%r10 + .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%rax,%rbx # ... + adcx %rax,%r10 + adox %rbx,%r11 + .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%rax,%rbx + adcx %rax,%r11 + adox %rbx,%r12 + mulx 5*8($aptr),%rax,%rbx + adcx %rax,%r12 + adox %rbx,%r13 + mulx 6*8($aptr),%rax,%rbx + adcx %rax,%r13 + adox %rbx,%r14 + mulx 7*8($aptr),%rax,%r15 + mov 1*8($aptr),%rdx # a[1] + adcx %rax,%r14 + adox $zero,%r15 + adc 8*8($tptr),%r15 + sbb $carry,$carry # mov %cf,$carry + xor $zero,$zero # cf=0, of=0 + + mov %r8,1*8($tptr) # t[1] + mov %r9,2*8($tptr) # t[2] + + mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] + mulx 3*8($aptr),%r9,%rax # a[3]*a[1] + adcx %r10,%r8 + adox %rbx,%r9 + mulx 4*8($aptr),%r10,%rbx # ... + adcx %r11,%r9 + adox %rax,%r10 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax + adcx %r12,%r10 + adox %rbx,%r11 + .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx + adcx %r13,%r11 + adox %r14,%r12 + .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 + mov 2*8($aptr),%rdx # a[2] + adcx %rax,%r12 + adox %rbx,%r13 + adcx %r15,%r13 + adox $zero,%r14 # of=0 + adcx $zero,%r14 # cf=0 + + mov %r8,3*8($tptr) # t[3] + mov %r9,4*8($tptr) # t[4] + + mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] + mulx 4*8($aptr),%r9,%rax # a[4]*a[2] + adcx %r10,%r8 + adox %rbx,%r9 + mulx 5*8($aptr),%r10,%rbx # ... + adcx %r11,%r9 + adox %rax,%r10 + .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax + adcx %r12,%r10 + adox %r13,%r11 + .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 + .byte 0x3e + mov 3*8($aptr),%rdx # a[3] + adcx %rbx,%r11 + adox %rax,%r12 + adcx %r14,%r12 + adox $zero,%r13 # of=0 + adcx $zero,%r13 # cf=0 + + mov %r8,5*8($tptr) # t[5] + mov %r9,6*8($tptr) # t[6] + + mulx 4*8($aptr),%r8,%rax # a[4]*a[3] + mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] + adcx %r10,%r8 + adox %rax,%r9 + mulx 6*8($aptr),%r10,%rax # ... + adcx %r11,%r9 + adox %r12,%r10 + mulx 7*8($aptr),%r11,%r12 + mov 4*8($aptr),%rdx # a[4] + mov 5*8($aptr),%r14 # a[5] + adcx %rbx,%r10 + adox %rax,%r11 + mov 6*8($aptr),%r15 # a[6] + adcx %r13,%r11 + adox $zero,%r12 # of=0 + adcx $zero,%r12 # cf=0 + + mov %r8,7*8($tptr) # t[7] + mov %r9,8*8($tptr) # t[8] + + mulx %r14,%r9,%rax # a[5]*a[4] + mov 7*8($aptr),%r8 # a[7] + adcx %r10,%r9 + mulx %r15,%r10,%rbx # a[6]*a[4] + adox %rax,%r10 + adcx %r11,%r10 + mulx %r8,%r11,%rax # a[7]*a[4] + mov %r14,%rdx # a[5] + adox %rbx,%r11 + adcx %r12,%r11 + #adox $zero,%rax # of=0 + adcx $zero,%rax # cf=0 + + mulx %r15,%r14,%rbx # a[6]*a[5] + mulx %r8,%r12,%r13 # a[7]*a[5] + mov %r15,%rdx # a[6] + lea 8*8($aptr),$aptr + adcx %r14,%r11 + adox %rbx,%r12 + adcx %rax,%r12 + .byte 0x66,0x66 + adox $zero,%r13 + + mulx %r8,%r8,%r14 # a[7]*a[6] + adcx %r8,%r13 + adcx $zero,%r14 + + cmp 8(%rsp),$aptr + je .Lsqrx8x_outer_break + + neg $carry # mov $carry,%cf + mov $zero,%r15 + mov 8*8($tptr),%r8 + adc 9*8($tptr),%r9 # +=t[9] + adc 10*8($tptr),%r10 # ... + adc 11*8($tptr),%r11 + adc 12*8($tptr),%r12 + adc 13*8($tptr),%r13 + adc 14*8($tptr),%r14 + adc 15*8($tptr),%r15 + lea 8*8($tptr),$tptr + sbb $carry,$carry # mov %cf,$carry + + mov -64($aptr),%rdx # a[0] + lea ($aptr),$aaptr + mov $carry,16(%rsp) # offload $carry + mov $tptr,24(%rsp) + + lea 8*8($tptr),$tptr + xor %eax,%eax # cf=0, of=0 + mov \$-8,%rcx + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + mov %r8,%rbx + mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] + adcx %rax,%rbx # +=t[8] + adox %r9,%r8 + + mulx 1*8($aaptr),%rax,%r9 # ... + adcx %rax,%r8 + adox %r10,%r9 + + mulx 2*8($aaptr),%rax,%r10 + adcx %rax,%r9 + adox %r11,%r10 + + mulx 3*8($aaptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + + mulx 5*8($aaptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 6*8($aaptr),%rax,%r14 + mov %rbx,($tptr,%rcx,8) # store t[8+i] + mov \$0,%ebx + adcx %rax,%r13 + adox %r15,%r14 + + .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 + mov 8($aptr,%rcx,8),%rdx # a[i] + adcx %rax,%r14 + adox %rbx,%r15 # %rbx is 0, of=0 + adcx %rbx,%r15 # cf=0 + + inc %rcx # of=0 + jnz .Lsqrx8x_loop + + lea 8*8($aaptr),$aaptr + cmp 8(%rsp),$aaptr # done? + je .Lsqrx8x_break + + sub 16(%rsp),%rbx # mov 16(%rsp),%cf + mov -64($aptr),%rdx + adc 0*8($tptr),%r8 + adc 1*8($tptr),%r9 + adc 2*8($tptr),%r10 + adc 3*8($tptr),%r11 + adc 4*8($tptr),%r12 + adc 5*8($tptr),%r13 + adc 6*8($tptr),%r14 + adc 7*8($tptr),%r15 + lea 8*8($tptr),$tptr + sbb %rbx,%rbx # mov %cf,%rbx + xor %eax,%eax # cf=0, of=0 + mov %rbx,16(%rsp) # offload carry + mov \$-8,%rcx + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + sub 16(%rsp),%r8 # consume last carry + mov 24(%rsp),$aaptr # initial $tptr + mov 0*8($aptr),%rdx # a[8], modulo-scheduled + mov %r8,0*8($tptr) + lea 8*8($aaptr),$aaptr + mov %r9,1*8($tptr) + mov 1*8($aaptr),%r8 # potentially forwarded store + mov %r10,2*8($tptr) + mov 2*8($aaptr),%r9 # ... + mov %r11,3*8($tptr) + mov 3*8($aaptr),%r10 + mov %r12,4*8($tptr) + mov 4*8($aaptr),%r11 + mov %r13,5*8($tptr) + mov 5*8($aaptr),%r12 + mov %r14,6*8($tptr) + mov 6*8($aaptr),%r13 + mov %r15,7*8($tptr) + mov 7*8($aaptr),%r14 + mov $aaptr,$tptr + xor $zero,$zero # cf=0, cf=0 + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + mov %r9,9*8($tptr) # t[9] + movq %xmm3,%rcx # -$num + mov %r10,10*8($tptr) # ... + mov %r11,11*8($tptr) + mov %r12,12*8($tptr) + mov %r13,13*8($tptr) + mov %r14,14*8($tptr) +___ +} { +my $i="%rcx"; +$code.=<<___; + mov (%rsp),$num # restore $num + + lea 48(%rsp),$tptr + mov ($aptr,$i),%rdx # a[0] + + mov 8($tptr),$A0[1] # t[1] + xor $A0[0],$A0[0] # t[0], of=0, cf=0 + adox $A0[1],$A0[1] + mov 16($tptr),$A1[0] # t[2] # prefetch + mov 24($tptr),$A1[1] # t[3] # prefetch + nop + #jmp .Lsqrx4x_shift_n_add # happens to be aligned + +.align 32 +.Lsqrx4x_shift_n_add: + mulx %rdx,%rax,%rbx + adox $A1[0],$A1[0] + adcx $A0[0],%rax + .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch + .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch + adox $A1[1],$A1[1] + adcx $A0[1],%rbx + mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch + mov %rax,0($tptr) + mov %rbx,8($tptr) + + mulx %rdx,%rax,%rbx + adox $A0[0],$A0[0] + adcx $A1[0],%rax + mov 16($aptr,$i),%rdx # a[i+2] # prefetch + mov 48($tptr),$A1[0] # t[2*i+6] # prefetch + adox $A0[1],$A0[1] + adcx $A1[1],%rbx + mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch + mov %rax,16($tptr) + mov %rbx,24($tptr) + + mulx %rdx,%rax,%rbx + adox $A1[0],$A1[0] + adcx $A0[0],%rax + mov 24($aptr,$i),%rdx # a[i+3] # prefetch + lea 32($i),$i + mov 64($tptr),$A0[0] # t[2*i+8] # prefetch + adox $A1[1],$A1[1] + adcx $A0[1],%rbx + mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch + mov %rax,32($tptr) + mov %rbx,40($tptr) + + mulx %rdx,%rax,%rbx + adox $A0[0],$A0[0] + adcx $A1[0],%rax + jrcxz .Lsqrx4x_shift_n_add_break + .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch + adox $A0[1],$A0[1] + adcx $A1[1],%rbx + mov 80($tptr),$A1[0] # t[2*i+10] # prefetch + mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch + mov %rax,48($tptr) + mov %rbx,56($tptr) + lea 64($tptr),$tptr + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcx $A1[1],%rbx + .byte 0x48,0x89,0x87,0x30,0x00,0x00,0x00 # mov %rax,48($tptr) + .byte 0x48,0x89,0x9f,0x38,0x00,0x00,0x00 # mov %rbx,56($tptr) + .byte 0x48,0x8d,0xbf,0x40,0x00,0x00,0x00 # lea 64($tptr),$tptr +___ +} +###################################################################### +# Montgomery reduction part, "word-by-word" algorithm. +# +# This new path is inspired by multiple submissions from Intel, by +# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, +# Vinodh Gopal... +{ +my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); + +$code.=<<___; + movq %xmm2,$nptr + mov 32(%rsp),%rbx # n0 + mov 48(%rsp),%rdx # "%r8", 8*0($tptr) + lea ($nptr,$num),%rax # end of n[] + #lea 48(%rsp,$num,2),$tptr # end of t[] buffer + mov %rax, 0(%rsp) # save end of n[] + mov $tptr,8(%rsp) # save end of t[] + + lea 48(%rsp),$tptr # initial t[] window + xor %rax,%rax + nop + #jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + mov 8*1($tptr),%r9 + mov 8*2($tptr),%r10 + mov 8*3($tptr),%r11 + mov 8*4($tptr),%r12 + mov %rdx,%r8 + imulq %rbx,%rdx # n0*a[i] + mov 8*5($tptr),%r13 + mov 8*6($tptr),%r14 + mov 8*7($tptr),%r15 + mov %rax,24(%rsp) # store top-most carry bit + + lea 8*8($tptr),$tptr + xor $carry,$carry # cf=0,of=0 + mov \$-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + mov %r8, %rbx + mulx 8*0($nptr),%rax,%r8 # n[0] + adcx %rbx,%rax # discarded + adox %r9,%r8 + + mulx 8*1($nptr),%rbx,%r9 # n[1] + adcx %rbx,%r8 + adox %r10,%r9 + + mulx 8*2($nptr),%rbx,%r10 + adcx %rbx,%r9 + adox %r11,%r10 + + mulx 8*3($nptr),%rbx,%r11 + adcx %rbx,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 + mov %rdx,%rax + mov %r8,%rdx + adcx %rbx,%r11 + adox %r13,%r12 + + mulx 32(%rsp),%rbx,%rdx # %rdx discarded + mov %rax,%rdx + mov %rax,48+64(%rsp,%rcx,8) # put aside n0*a[i] + + mulx 8*5($nptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 8*6($nptr),%rax,%r14 + adcx %rax,%r13 + adox %r15,%r14 + + mulx 8*7($nptr),%rax,%r15 + mov %rbx,%rdx + adcx %rax,%r14 + adox $carry,%r15 # $carry is 0 + adcx $carry,%r15 # cf=0 + + inc %rcx # of=0 + jnz .Lsqrx8x_reduce + + lea 8*8($nptr),$nptr + xor %rax,%rax + cmp 0(%rsp),$nptr # end of n[]? + jae .Lsqrx8x_no_tail + + mov 48(%rsp),%rdx # pull n0*a[0] + add 8*0($tptr),%r8 + adcx 8*1($tptr),%r9 + adcx 8*2($tptr),%r10 + adcx 8*3($tptr),%r11 + adcx 8*4($tptr),%r12 + adcx 8*5($tptr),%r13 + adcx 8*6($tptr),%r14 + adcx 8*7($tptr),%r15 + lea 8*8($tptr),$tptr + sbb $carry,$carry # top carry + + mov \$-8,%rcx + mov $carry,16(%rsp) + xor $carry,$carry # of=0, cf=0 + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + mov %r8,%rbx + mulx 8*0($nptr),%rax,%r8 + adcx %rax,%rbx + adox %r9,%r8 + + mulx 8*1($nptr),%rax,%r9 + adcx %rax,%r8 + adox %r10,%r9 + + mulx 8*2($nptr),%rax,%r10 + adcx %rax,%r9 + adox %r11,%r10 + + mulx 8*3($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + + .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 + adcx %rax,%r11 + adox %r13,%r12 + + mulx 8*5($nptr),%rax,%r13 + adcx %rax,%r12 + adox %r14,%r13 + + mulx 8*6($nptr),%rax,%r14 + adcx %rax,%r13 + adox %r15,%r14 + + mulx 8*7($nptr),%rax,%r15 + mov 48+72(%rsp,%rcx,8),%rdx # pull n0*a[i] + adcx %rax,%r14 + .byte 0x66 + adox $carry,%r15 + mov %rbx,($tptr,%rcx,8) # save result + mov %r8,%rbx + adcx $carry,%r15 # cf=0 + + inc %rcx # of=0 + jnz .Lsqrx8x_tail + + lea 8*8($nptr),$nptr + cmp 0(%rsp),$nptr # end of n[]? + jae .Lsqrx8x_tail_done # break out of loop + + sub 16(%rsp),$carry # neg $carry + mov 48(%rsp),%rdx # pull n0*a[0] + adcx 8*0($tptr),%r8 + adcx 8*1($tptr),%r9 + adcx 8*2($tptr),%r10 + adcx 8*3($tptr),%r11 + adcx 8*4($tptr),%r12 + adcx 8*5($tptr),%r13 + adcx 8*6($tptr),%r14 + adcx 8*7($tptr),%r15 + lea 8*8($tptr),$tptr + sbb $carry,$carry + + mov \$-8,%rcx + mov $carry,16(%rsp) + xor $carry,$carry # of=0, cf=0 + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + add 24(%rsp),%r8 # can this overflow? + xor %rax,%rax + + sub 16(%rsp),$carry # neg $carry +.Lsqrx8x_no_tail: # carry flag is 0 + adc 8*0($tptr),%r8 + movq %xmm3,%rcx + adc 8*1($tptr),%r9 + movq %xmm2,$nptr # restore $nptr + adc 8*2($tptr),%r10 + lea 8*8($tptr),$carry # borrow $carry + adc 8*3($tptr),%r11 + adc 8*4($tptr),%r12 + adc 8*5($tptr),%r13 + adc 8*6($tptr),%r14 + adc 8*7($tptr),%r15 + adc %rax,%rax # top-most carry + + cmp 8(%rsp),$carry # end of t[]? + mov 32(%rsp),%rbx # n0 + mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" + + lea 8*8($tptr,%rcx),$tptr # start of current t[] window + mov %r8,-8*8($carry) # store top 512 bits + mov %r9,-8*7($carry) + mov %r10,-8*6($carry) + mov %r11,-8*5($carry) + mov %r12,-8*4($carry) + mov %r13,-8*3($carry) + mov %r14,-8*2($carry) + mov %r15,-8*1($carry) + + jb .Lsqrx8x_reduction_loop + + mov %rcx,$num + neg $num # restore $num +___ +} +############################################################## +# Post-condition, 8x unrolled +# +{ +my ($rptr,$nptr,$lptr,$i)=($aptr,"%rbp","%rbx","%rcx"); +my @ri=map("%r$_",(10..13)); +my @ni=map("%r$_",(14..15)); +$code.=<<___; + lea ($nptr,$num),$nptr # end of $nptr + lea 48(%rsp,$num),$lptr # end of lower half of t[2*num] + lea 48(%rsp,$num),$tptr + neg %rax # top-most carry as mask + xor %rdx,%rdx + movq %xmm1,$rptr # restore $rptr + + mov 0*8($nptr,$i),%r8 + mov 1*8($nptr,$i),%r9 + neg %r8 + jmp .Lsqrx8x_sub_entry + +.align 32 +.Lsqrx8x_sub: + mov 0*8($nptr,$i),%r8 + mov 1*8($nptr,$i),%r9 + not %r8 +.Lsqrx8x_sub_entry: + mov 2*8($nptr,$i),%r10 + not %r9 + and %rax,%r8 + mov 3*8($nptr,$i),%r11 + not %r10 + and %rax,%r9 + mov 4*8($nptr,$i),%r12 + not %r11 + and %rax,%r10 + mov 5*8($nptr,$i),%r13 + not %r12 + and %rax,%r11 + mov 6*8($nptr,$i),%r14 + not %r13 + and %rax,%r12 + mov 7*8($nptr,$i),%r15 + not %r14 + and %rax,%r13 + movdqa %xmm0,0*8($lptr,$i) # zap lower half + not %r15 + and %rax,%r14 + movdqa %xmm0,2*8($lptr,$i) + and %rax,%r15 + + neg %rdx # mov %rdx,%cf + movdqa %xmm0,4*8($lptr,$i) + adc 0*8($tptr),%r8 + adc 1*8($tptr),%r9 + movdqa %xmm0,6*8($lptr,$i) + adc 2*8($tptr),%r10 + adc 3*8($tptr),%r11 + movdqa %xmm0,0*8($tptr) # zap upper half + adc 4*8($tptr),%r12 + adc 5*8($tptr),%r13 + movdqa %xmm0,2*8($tptr) + adc 6*8($tptr),%r14 + adc 7*8($tptr),%r15 + movdqa %xmm0,4*8($tptr) + sbb %rdx,%rdx # mov %cf,%rdx + movdqa %xmm0,6*8($tptr) + lea 8*8($tptr),$tptr + + mov %r8,0*8($rptr) + mov %r9,1*8($rptr) + mov %r10,2*8($rptr) + mov %r11,3*8($rptr) + mov %r12,4*8($rptr) + mov %r13,5*8($rptr) + mov %r14,6*8($rptr) + mov %r15,7*8($rptr) + lea 8*8($rptr),$rptr + + add \$64,$i + jnz .Lsqrx8x_sub +___ +} +$code.=<<___; + movq %xmm4,%rsi # restore %rsp + mov \$1,%rax + mov 0(%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lsqrx8x_epilogue: + ret +.size bn_sqrx8x_mont,.-bn_sqrx8x_mont +___ +}}} $code.=<<___; .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by " .align 16 @@ -1708,13 +2840,18 @@ sqr_handler: mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip - lea .Lsqr8x_body(%rip),%r10 + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp - lea .Lsqr8x_epilogue(%rip),%r10 + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail @@ -1787,7 +2924,17 @@ sqr_handler: .rva .LSEH_begin_bn_sqr8x_mont .rva .LSEH_end_bn_sqr8x_mont .rva .LSEH_info_bn_sqr8x_mont +___ +$code.=<<___ if ($addx); + .rva .LSEH_begin_bn_mulx4x_mont + .rva .LSEH_end_bn_mulx4x_mont + .rva .LSEH_info_bn_mulx4x_mont + .rva .LSEH_begin_bn_sqrx8x_mont + .rva .LSEH_end_bn_sqrx8x_mont + .rva .LSEH_info_bn_sqrx8x_mont +___ +$code.=<<___; .section .xdata .align 8 .LSEH_info_bn_mul_mont: @@ -1801,6 +2948,17 @@ sqr_handler: .LSEH_info_bn_sqr8x_mont: .byte 9,0,0,0 .rva sqr_handler + .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] +___ +$code.=<<___ if ($addx); +.LSEH_info_bn_mulx4x_mont: + .byte 9,0,0,0 + .rva sqr_handler + .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] +.LSEH_info_bn_sqrx8x_mont: + .byte 9,0,0,0 + .rva sqr_handler + .rva .Lsqrx8x_body,.Lsqrx8x_epilogue # HandlerData[] ___ } diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl index 8f8dc5a59..7a691eb05 100755 --- a/crypto/bn/asm/x86_64-mont5.pl +++ b/crypto/bn/asm/x86_64-mont5.pl @@ -31,6 +31,21 @@ die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.22); +} + +if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $addx = ($1>=2.10); +} + +if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $addx = ($1>=11); +} + # int bn_mul_mont_gather5( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, @@ -53,6 +68,8 @@ $m1="%rbp"; $code=<<___; .text +.extern OPENSSL_ia32cap_P + .globl bn_mul_mont_gather5 .type bn_mul_mont_gather5,\@function,6 .align 64 @@ -61,6 +78,11 @@ bn_mul_mont_gather5: jnz .Lmul_enter cmp \$8,${num}d jb .Lmul_enter +___ +$code.=<<___ if ($addx); + mov OPENSSL_ia32cap_P+8(%rip),%r11d +___ +$code.=<<___; jmp .Lmul4x_enter .align 16 @@ -347,6 +369,13 @@ $code.=<<___; .align 16 bn_mul4x_mont_gather5: .Lmul4x_enter: +___ +$code.=<<___ if ($addx); + and \$0x80100,%r11d + cmp \$0x80100,%r11d + je .Lmulx4x_enter +___ +$code.=<<___; mov ${num}d,${num}d mov `($win64?56:8)`(%rsp),%r10d # load 7th argument push %rbx @@ -828,7 +857,404 @@ $code.=<<___; .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 ___ }}} +if ($addx) {{{ +my $bp="%rdx"; # original value +$code.=<<___; +.type bn_mulx4x_mont_gather5,\@function,6 +.align 32 +bn_mulx4x_mont_gather5: +.Lmulx4x_enter: + mov %rsp,%rax + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 +___ +$code.=<<___ if ($win64); + lea -0x28(%rsp),%rsp + movaps %xmm6,(%rsp) + movaps %xmm7,0x10(%rsp) +___ +$code.=<<___; + shl \$3,${num}d # convert $num to bytes + xor %r10,%r10 + mov %rsp,%r11 # put aside %rsp + sub $num,%r10 # -$num + mov ($n0),$n0 # *n0 + lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8) + and \$-128,%rsp + ############################################################## + # Stack layout + # +0 num + # +8 off-loaded &b[i] + # +16 end of b[num] + # +24 saved n0 + # +32 saved rp + # +40 + # +48 inner counter + # +56 saved %rsp + # +64 tmp[num+1] + # + mov $num,0(%rsp) # save $num + shl \$5,$num + lea 256($bp,$num),%r10 + shr \$5+5,$num + mov %r10,16(%rsp) # end of b[num] + sub \$1,$num + mov $n0, 24(%rsp) # save *n0 + mov $rp, 32(%rsp) # save $rp + mov $num,48(%rsp) # inner counter + mov %r11,56(%rsp) # save original %rsp + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: +___ +my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= + ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); +my $rptr=$bptr; +my $STRIDE=2**5*8; # 5 is "window size" +my $N=$STRIDE/4; # should match cache line size +$code.=<<___; + mov `($win64?56:8)`(%rax),%r10d # load 7th argument + mov %r10,%r11 + shr \$`log($N/8)/log(2)`,%r10 + and \$`$N/8-1`,%r11 + not %r10 + lea .Lmagic_masks(%rip),%rax + and \$`2**5/($N/8)-1`,%r10 # 5 is "window size" + lea 96($bp,%r11,8),$bptr # pointer within 1st cache line + movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which + movq 8(%rax,%r10,8),%xmm5 # cache line contains element + movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument + movq 24(%rax,%r10,8),%xmm7 + + movq `0*$STRIDE/4-96`($bptr),%xmm0 + movq `1*$STRIDE/4-96`($bptr),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bptr),%xmm2 + pand %xmm5,%xmm1 + movq `3*$STRIDE/4-96`($bptr),%xmm3 + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + por %xmm2,%xmm0 + lea $STRIDE($bptr),$bptr + por %xmm3,%xmm0 + + movq %xmm0,%rdx # bp[0] + movq `0*$STRIDE/4-96`($bptr),%xmm0 + movq `1*$STRIDE/4-96`($bptr),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bptr),%xmm2 + pand %xmm5,%xmm1 + + lea 64+32(%rsp),$tptr + mov %rdx,$bi + xor $zero,$zero # of=0,cf=0 + + mulx 0*8($aptr),$mi,%rax # a[0]*b[0] + mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%r13 # ... + adcx %r14,%r12 + adcx $zero,%r13 + + movq `3*$STRIDE/4-96`($bptr),%xmm3 + lea $STRIDE($bptr),%r10 # next &b[i] + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + mov $mi,$bptr # borrow $bptr + imulq 24(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + + por %xmm2,%xmm0 + por %xmm3,%xmm0 + mov %r10,8(%rsp) # off-load &b[i] + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + lea 4*8($aptr),$aptr + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,$bptr # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + mov 48(%rsp),$bptr # counter value + mov %r10,-4*8($tptr) + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + lea 4*8($nptr),$nptr + mov %r12,-2*8($tptr) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcx $zero,%r15 # cf=0, modulo-scheduled + mulx 0*8($aptr),%r10,%rax # a[4]*b[0] + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + .byte 0x66,0x66 + mov $mi,%rdx + adcx %rax,%r13 + adcx $zero,%r14 # cf=0 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + + adox %r15,%r10 + mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + .byte 0x3e + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + mov %r11,-4*8($tptr) + adcx %rax,%r12 + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_1st + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + movq %xmm0,%rdx # bp[1] + adc $zero,%r15 # modulo-scheduled + add %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + sub $num,$aptr # rewind $aptr + mov %r15,($tptr) # save top-most carry + mov 64(%rsp),%r10 + lea 64(%rsp),$tptr + sub $num,$nptr # rewind $nptr + xor $zero,$zero # cf=0, of=0 + mov %rdx,$bi + + movq `0*$STRIDE/4-96`($bptr),%xmm0 + movq `1*$STRIDE/4-96`($bptr),%xmm1 + pand %xmm4,%xmm0 + movq `2*$STRIDE/4-96`($bptr),%xmm2 + pand %xmm5,%xmm1 + + mulx 0*8($aptr),$mi,%rax # a[0]*b[i] + adox %r10,$mi + mov 1*8($tptr),%r10 + mulx 1*8($aptr),%r11,%r14 # a[1]*b[i] + adcx %rax,%r11 + mulx 2*8($aptr),%r12,%r13 # ... + adox %r10,%r11 + adcx %r14,%r12 + adox $zero,%r12 + adcx $zero,%r13 + + movq `3*$STRIDE/4-96`($bptr),%xmm3 + lea $STRIDE($bptr),%r10 # next &b[i] + pand %xmm6,%xmm2 + por %xmm1,%xmm0 + pand %xmm7,%xmm3 + + mov $mi,$bptr # borrow $bptr + imulq 24(%rsp),$mi # "t[0]"*n0 + xor $zero,$zero # cf=0, of=0 + + por %xmm2,%xmm0 + por %xmm3,%xmm0 + mov %r10,8(%rsp) # off-load &b[i] + mov 2*8($tptr),%r10 + + mulx 3*8($aptr),%rax,%r14 + mov $mi,%rdx + adox %r10,%r12 + adcx %rax,%r13 + adox 3*8($tptr),%r13 + adcx $zero,%r14 + lea 4*8($aptr),$aptr + lea 4*8($tptr),$tptr + adox $zero,%r14 + + mulx 0*8($nptr),%rax,%r10 + adcx %rax,$bptr # discarded + adox %r11,%r10 + mulx 1*8($nptr),%rax,%r11 + adcx %rax,%r10 + adox %r12,%r11 + mulx 2*8($nptr),%rax,%r12 + .byte 0x3e + mov %r10,-4*8($tptr) + .byte 0x3e + mov 0*8($tptr),%r10 + adcx %rax,%r11 + adox %r13,%r12 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-3*8($tptr) + adcx %rax,%r12 + adox $zero,%r15 # of=0 + mov 48(%rsp),$bptr # counter value + mov %r12,-2*8($tptr) + lea 4*8($nptr),$nptr + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + adcx $zero,%r15 # cf=0, modulo-scheduled + adox %r10,%r14 + mulx 0*8($aptr),%r10,%rax # a[4]*b[i] + mov 1*8($tptr),%r13 + adcx %r14,%r10 + mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] + adox %rax,%r11 + mulx 2*8($aptr),%r12,%rax # ... + adcx %r13,%r11 + adox %r14,%r12 + mulx 3*8($aptr),%r13,%r14 + mov $mi,%rdx + adcx 2*8($tptr),%r12 + adox %rax,%r13 + adcx 3*8($tptr),%r13 + adox $zero,%r14 # of=0 + lea 4*8($aptr),$aptr + .byte 0x48,0x8d,0x9b,0x20,0x00,0x00,0x00 # lea 4*8($tptr),$tptr + adcx $zero,%r14 # cf=0 + + adox %r15,%r10 + .byte 0x3e,0xc4,0x62,0xfb,0xf6,0x79,0x00 # mulx 0*8($nptr),%rax,%r15 + adcx %rax,%r10 + adox %r15,%r11 + mulx 1*8($nptr),%rax,%r15 + adcx %rax,%r11 + adox %r15,%r12 + mulx 2*8($nptr),%rax,%r15 + mov %r10,-5*8($tptr) + mov 0*8($tptr),%r10 + adcx %rax,%r12 + adox %r15,%r13 + mulx 3*8($nptr),%rax,%r15 + mov $bi,%rdx + mov %r11,-4*8($tptr) + mov %r12,-3*8($tptr) + adcx %rax,%r13 + adox $zero,%r15 + lea 4*8($nptr),$nptr + mov %r13,-2*8($tptr) + + dec $bptr # of=0, pass cf + jnz .Lmulx4x_inner + + mov 0(%rsp),$num # load num + mov 8(%rsp),$bptr # re-load &b[i] + movq %xmm0,%rdx # bp[i+1] + adc $zero,%r15 # modulo-scheduled + sub %r10,$zero # pull top-most carry + adc %r15,%r14 + sbb %r15,%r15 # top-most carry + mov %r14,-1*8($tptr) + + cmp 16(%rsp),$bptr + jb .Lmulx4x_outer + + neg $num + mov 32(%rsp),$rptr # restore rp + lea 64(%rsp),$tptr + + xor %rdx,%rdx + pxor %xmm0,%xmm0 + mov 0*8($nptr,$num),%r8 + mov 1*8($nptr,$num),%r9 + neg %r8 + jmp .Lmulx4x_sub_entry + +.align 32 +.Lmulx4x_sub: + mov 0*8($nptr,$num),%r8 + mov 1*8($nptr,$num),%r9 + not %r8 +.Lmulx4x_sub_entry: + mov 2*8($nptr,$num),%r10 + not %r9 + and %r15,%r8 + mov 3*8($nptr,$num),%r11 + not %r10 + and %r15,%r9 + not %r11 + and %r15,%r10 + and %r15,%r11 + + neg %rdx # mov %rdx,%cf + adc 0*8($tptr),%r8 + adc 1*8($tptr),%r9 + movdqa %xmm0,($tptr) + adc 2*8($tptr),%r10 + adc 3*8($tptr),%r11 + movdqa %xmm0,16($tptr) + lea 4*8($tptr),$tptr + sbb %rdx,%rdx # mov %cf,%rdx + + mov %r8,0*8($rptr) + mov %r9,1*8($rptr) + mov %r10,2*8($rptr) + mov %r11,3*8($rptr) + lea 4*8($rptr),$rptr + + add \$32,$num + jnz .Lmulx4x_sub + + mov 56(%rsp),%rsi # restore %rsp + mov \$1,%rax +___ +$code.=<<___ if ($win64); + movaps (%rsi),%xmm6 + movaps 0x10(%rsi),%xmm7 + lea 0x28(%rsi),%rsi +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lmulx4x_epilogue: + ret +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 +___ +}}} { my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order