bn/asm/x86_64-mont5.pl: unify gather procedure in rarely used path

and reorganize/harmonize post-conditions.

Additional hardening following on from CVE-2016-0702

Reviewed-by: Richard Levitte <levitte@openssl.org>
Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from master)
This commit is contained in:
Andy Polyakov 2016-01-26 16:50:10 +01:00 committed by Matt Caswell
parent 25d14c6c29
commit 515f3be47a
2 changed files with 280 additions and 201 deletions

View File

@ -795,7 +795,7 @@ bn_sqr8x_mont:
sub %r11,%rsp sub %r11,%rsp
.Lsqr8x_sp_done: .Lsqr8x_sp_done:
and \$-64,%rsp and \$-64,%rsp
mov $num,%r10 mov $num,%r10
neg $num neg $num
mov $n0, 32(%rsp) mov $n0, 32(%rsp)
@ -814,34 +814,87 @@ $code.=<<___ if ($addx);
jne .Lsqr8x_nox jne .Lsqr8x_nox
call bn_sqrx8x_internal # see x86_64-mont5 module call bn_sqrx8x_internal # see x86_64-mont5 module
# %rax top-most carry
pxor %xmm0,%xmm0 # %rbp nptr
lea 48(%rsp),%rax # %rcx -8*num
shr \$3+2,$num # %r8 end of tp[2*num]
mov 40(%rsp),%rsi # restore %rsp lea (%r8,%rcx),%rbx
jmp .Lsqr8x_zero mov %rcx,$num
mov %rcx,%rdx
movq %xmm1,$rptr
sar \$3+2,%rcx # %cf=0
jmp .Lsqr8x_sub
.align 32 .align 32
.Lsqr8x_nox: .Lsqr8x_nox:
___ ___
$code.=<<___; $code.=<<___;
call bn_sqr8x_internal # see x86_64-mont5 module call bn_sqr8x_internal # see x86_64-mont5 module
# %rax top-most carry
pxor %xmm0,%xmm0 # %rbp nptr
lea 48(%rsp),%rax # %r8 -8*num
shr \$3+2,$num # %rdi end of tp[2*num]
mov 40(%rsp),%rsi # restore %rsp lea (%rdi,$num),%rbx
jmp .Lsqr8x_zero mov $num,%rcx
mov $num,%rdx
movq %xmm1,$rptr
sar \$3+2,%rcx # %cf=0
jmp .Lsqr8x_sub
.align 32 .align 32
.Lsqr8x_zero: .Lsqr8x_sub:
movdqa %xmm0,16*0(%rax) # wipe t mov 8*0(%rbx),%r12
movdqa %xmm0,16*1(%rax) mov 8*1(%rbx),%r13
movdqa %xmm0,16*2(%rax) mov 8*2(%rbx),%r14
movdqa %xmm0,16*3(%rax) mov 8*3(%rbx),%r15
lea 16*4(%rax),%rax lea 8*4(%rbx),%rbx
dec $num sbb 8*0(%rbp),%r12
jnz .Lsqr8x_zero sbb 8*1(%rbp),%r13
sbb 8*2(%rbp),%r14
sbb 8*3(%rbp),%r15
lea 8*4(%rbp),%rbp
mov %r12,8*0($rptr)
mov %r13,8*1($rptr)
mov %r14,8*2($rptr)
mov %r15,8*3($rptr)
lea 8*4($rptr),$rptr
inc %rcx # preserves %cf
jnz .Lsqr8x_sub
sbb \$0,%rax # top-most carry
lea (%rbx,$num),%rbx # rewind
lea ($rptr,$num),$rptr # rewind
movq %rax,%xmm1
pxor %xmm0,%xmm0
pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp
jmp .Lsqr8x_cond_copy
.align 32
.Lsqr8x_cond_copy:
movdqa 16*0(%rbx),%xmm2
movdqa 16*1(%rbx),%xmm3
lea 16*2(%rbx),%rbx
movdqu 16*0($rptr),%xmm4
movdqu 16*1($rptr),%xmm5
lea 16*2($rptr),$rptr
movdqa %xmm0,-16*2(%rbx) # zero tp
movdqa %xmm0,-16*1(%rbx)
movdqa %xmm0,-16*2(%rbx,%rdx)
movdqa %xmm0,-16*1(%rbx,%rdx)
pcmpeqd %xmm1,%xmm0
pand %xmm1,%xmm2
pand %xmm1,%xmm3
pand %xmm0,%xmm4
pand %xmm0,%xmm5
pxor %xmm0,%xmm0
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqu %xmm4,-16*2($rptr)
movdqu %xmm5,-16*1($rptr)
add \$32,$num
jnz .Lsqr8x_cond_copy
mov \$1,%rax mov \$1,%rax
mov -48(%rsi),%r15 mov -48(%rsi),%r15
@ -1108,64 +1161,75 @@ $code.=<<___;
adc $zero,%r15 # modulo-scheduled adc $zero,%r15 # modulo-scheduled
sub 0*8($tptr),$zero # pull top-most carry sub 0*8($tptr),$zero # pull top-most carry
adc %r15,%r14 adc %r15,%r14
mov -8($nptr),$mi
sbb %r15,%r15 # top-most carry sbb %r15,%r15 # top-most carry
mov %r14,-1*8($tptr) mov %r14,-1*8($tptr)
cmp 16(%rsp),$bptr cmp 16(%rsp),$bptr
jne .Lmulx4x_outer jne .Lmulx4x_outer
sub %r14,$mi # compare top-most words
sbb $mi,$mi
or $mi,%r15
neg $num
xor %rdx,%rdx
mov 32(%rsp),$rptr # restore rp
lea 64(%rsp),$tptr lea 64(%rsp),$tptr
sub $num,$nptr # rewind $nptr
pxor %xmm0,%xmm0 neg %r15
mov 0*8($nptr,$num),%r8 mov $num,%rdx
mov 1*8($nptr,$num),%r9 shr \$3+2,$num # %cf=0
neg %r8 mov 32(%rsp),$rptr # restore rp
jmp .Lmulx4x_sub_entry jmp .Lmulx4x_sub
.align 32 .align 32
.Lmulx4x_sub: .Lmulx4x_sub:
mov 0*8($nptr,$num),%r8 mov 8*0($tptr),%r11
mov 1*8($nptr,$num),%r9 mov 8*1($tptr),%r12
not %r8 mov 8*2($tptr),%r13
.Lmulx4x_sub_entry: mov 8*3($tptr),%r14
mov 2*8($nptr,$num),%r10 lea 8*4($tptr),$tptr
not %r9 sbb 8*0($nptr),%r11
and %r15,%r8 sbb 8*1($nptr),%r12
mov 3*8($nptr,$num),%r11 sbb 8*2($nptr),%r13
not %r10 sbb 8*3($nptr),%r14
and %r15,%r9 lea 8*4($nptr),$nptr
not %r11 mov %r11,8*0($rptr)
and %r15,%r10 mov %r12,8*1($rptr)
and %r15,%r11 mov %r13,8*2($rptr)
mov %r14,8*3($rptr)
neg %rdx # mov %rdx,%cf lea 8*4($rptr),$rptr
adc 0*8($tptr),%r8 dec $num # preserves %cf
adc 1*8($tptr),%r9
movdqa %xmm0,($tptr)
adc 2*8($tptr),%r10
adc 3*8($tptr),%r11
movdqa %xmm0,16($tptr)
lea 4*8($tptr),$tptr
sbb %rdx,%rdx # mov %cf,%rdx
mov %r8,0*8($rptr)
mov %r9,1*8($rptr)
mov %r10,2*8($rptr)
mov %r11,3*8($rptr)
lea 4*8($rptr),$rptr
add \$32,$num
jnz .Lmulx4x_sub jnz .Lmulx4x_sub
sbb \$0,%r15 # top-most carry
lea 64(%rsp),$tptr
sub %rdx,$rptr # rewind
movq %r15,%xmm1
pxor %xmm0,%xmm0
pshufd \$0,%xmm1,%xmm1
mov 40(%rsp),%rsi # restore %rsp mov 40(%rsp),%rsi # restore %rsp
jmp .Lmulx4x_cond_copy
.align 32
.Lmulx4x_cond_copy:
movdqa 16*0($tptr),%xmm2
movdqa 16*1($tptr),%xmm3
lea 16*2($tptr),$tptr
movdqu 16*0($rptr),%xmm4
movdqu 16*1($rptr),%xmm5
lea 16*2($rptr),$rptr
movdqa %xmm0,-16*2($tptr) # zero tp
movdqa %xmm0,-16*1($tptr)
pcmpeqd %xmm1,%xmm0
pand %xmm1,%xmm2
pand %xmm1,%xmm3
pand %xmm0,%xmm4
pand %xmm0,%xmm5
pxor %xmm0,%xmm0
por %xmm2,%xmm4
por %xmm3,%xmm5
movdqu %xmm4,-16*2($rptr)
movdqu %xmm5,-16*1($rptr)
sub \$32,%rdx
jnz .Lmulx4x_cond_copy
mov %rdx,($tptr)
mov \$1,%rax mov \$1,%rax
mov -48(%rsi),%r15 mov -48(%rsi),%r15
mov -40(%rsi),%r14 mov -40(%rsi),%r14

View File

@ -99,25 +99,18 @@ $code.=<<___;
.Lmul_enter: .Lmul_enter:
mov ${num}d,${num}d mov ${num}d,${num}d
mov %rsp,%rax mov %rsp,%rax
movd `($win64?56:8)`(%rsp),%xmm0 # load 7th argument movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
lea .Lmagic_masks(%rip),%r10 lea .Linc(%rip),%r10
push %rbx push %rbx
push %rbp push %rbp
push %r12 push %r12
push %r13 push %r13
push %r14 push %r14
push %r15 push %r15
___
$code.=<<___ if ($win64);
lea -0x38(%rsp),%rsp
movaps %xmm6,(%rsp)
movaps %xmm7,0x10(%rsp)
movaps %xmm8,0x20(%rsp)
___
$code.=<<___;
lea 2($num),%r11 lea 2($num),%r11
neg %r11 neg %r11
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)) lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
and \$-1024,%rsp # minimize TLB usage and \$-1024,%rsp # minimize TLB usage
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
@ -128,64 +121,89 @@ ___
$STRIDE=2**5*8; # 5 is "window size" $STRIDE=2**5*8; # 5 is "window size"
$N=$STRIDE/4; # should match cache line size $N=$STRIDE/4; # should match cache line size
$code.=<<___; $code.=<<___;
################################################################ movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
# calculate mask: one of %xmm4..7 will contain 0xff..00 or movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
# 0x00..ff denoting which half of a quarter of corresponding lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
# cache line is significant. and \$-16,%r10
#
movq 56(%r10),%xmm1 # 0b11001
movq %xmm0,%rdx
pand %xmm1,%xmm0
movdqa 0(%r10),%xmm4
pshufd \$0,%xmm0,%xmm0 # broadcast masked index
movdqa 16(%r10),%xmm5
movdqa 32(%r10),%xmm6
pcmpeqd %xmm0,%xmm4
movdqa 48(%r10),%xmm7
pcmpeqd %xmm0,%xmm5
pcmpeqd %xmm0,%xmm6
pcmpeqd %xmm0,%xmm7
################################################################ pshufd \$0,%xmm5,%xmm5 # broadcast index
# calculate index in 1st cache line, but in such manner that movdqa %xmm1,%xmm4
# if target data is in another cache line, then relevant movdqa %xmm1,%xmm2
# "rotating" reference would land on it... ___
# ########################################################################
shr \$1,%rdx # idx/=2 # calculate mask by comparing 0..31 to index and save result to stack
mov %rdx,$j #
shr \$2,%rdx $code.=<<___;
sub %rdx,$j paddd %xmm0,%xmm1
and \$3,$j # (idx-idx/4)%4 pcmpeqd %xmm5,%xmm0 # compare to 1,0
shl \$4,$j # scale for xmm references .byte 0x67
movdqa %xmm4,%xmm3
___
for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
paddd %xmm1,%xmm2
pcmpeqd %xmm5,%xmm1 # compare to 3,2
movdqa %xmm0,`16*($k+0)+112`(%r10)
movdqa %xmm4,%xmm0
################################################################ paddd %xmm2,%xmm3
# "rotating" references are touching different cache banks in pcmpeqd %xmm5,%xmm2 # compare to 5,4
# different cache lines, so that not only all cache lines are movdqa %xmm1,`16*($k+1)+112`(%r10)
# referred in each iteration, but even all cache banks. movdqa %xmm4,%xmm1
#
lea 16($j),$m0 paddd %xmm3,%xmm0
lea 32($j),$m1 pcmpeqd %xmm5,%xmm3 # compare to 7,6
and \$63,$m0 movdqa %xmm2,`16*($k+2)+112`(%r10)
lea 48($j),%rdx movdqa %xmm4,%xmm2
and \$63,$m1
and \$63,%rdx paddd %xmm0,%xmm1
movdqa `0*$STRIDE/4-128`($bp,$j),%xmm0 pcmpeqd %xmm5,%xmm0
movdqa `1*$STRIDE/4-128`($bp,$m0),%xmm1 movdqa %xmm3,`16*($k+3)+112`(%r10)
movdqa `2*$STRIDE/4-128`($bp,$m1),%xmm2 movdqa %xmm4,%xmm3
movdqa `3*$STRIDE/4-128`($bp,%rdx),%xmm3 ___
pand %xmm4,%xmm0 }
pand %xmm5,%xmm1 $code.=<<___; # last iteration can be optimized
pand %xmm6,%xmm2 paddd %xmm1,%xmm2
por %xmm1,%xmm0 pcmpeqd %xmm5,%xmm1
pand %xmm7,%xmm3 movdqa %xmm0,`16*($k+0)+112`(%r10)
paddd %xmm2,%xmm3
.byte 0x67
pcmpeqd %xmm5,%xmm2
movdqa %xmm1,`16*($k+1)+112`(%r10)
pcmpeqd %xmm5,%xmm3
movdqa %xmm2,`16*($k+2)+112`(%r10)
pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
pand `16*($k+1)-128`($bp),%xmm1
pand `16*($k+2)-128`($bp),%xmm2
movdqa %xmm3,`16*($k+3)+112`(%r10)
pand `16*($k+3)-128`($bp),%xmm3
por %xmm2,%xmm0 por %xmm2,%xmm0
lea $STRIDE($bp),$bp por %xmm3,%xmm1
por %xmm3,%xmm0 ___
movq $j,%xmm8 for($k=0;$k<$STRIDE/16-4;$k+=4) {
$code.=<<___;
movdqa `16*($k+0)-128`($bp),%xmm4
movdqa `16*($k+1)-128`($bp),%xmm5
movdqa `16*($k+2)-128`($bp),%xmm2
pand `16*($k+0)+112`(%r10),%xmm4
movdqa `16*($k+3)-128`($bp),%xmm3
pand `16*($k+1)+112`(%r10),%xmm5
por %xmm4,%xmm0
pand `16*($k+2)+112`(%r10),%xmm2
por %xmm5,%xmm1
pand `16*($k+3)+112`(%r10),%xmm3
por %xmm2,%xmm0
por %xmm3,%xmm1
___
}
$code.=<<___;
por %xmm1,%xmm0
pshufd \$0x4e,%xmm0,%xmm1 pshufd \$0x4e,%xmm0,%xmm1
por %xmm1,%xmm0 # merge upper and lower halves por %xmm1,%xmm0
lea $STRIDE($bp),$bp
movq %xmm0,$m0 # m0=bp[0] movq %xmm0,$m0 # m0=bp[0]
mov ($n0),$n0 # pull n0[0] value mov ($n0),$n0 # pull n0[0] value
@ -232,15 +250,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1 mulq $m1 # np[j]*m1
cmp $num,$j cmp $num,$j
jne .L1st jne .L1st # note that upon exit $j==$num, so
# they can be used interchangeably
movq %xmm8,$j
add %rax,$hi1 add %rax,$hi1
adc \$0,%rdx adc \$0,%rdx
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
adc \$0,%rdx adc \$0,%rdx
mov $hi1,-16(%rsp,$num,8) # tp[j-1] mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1 mov %rdx,$hi1
mov $lo0,$hi0 mov $lo0,$hi0
@ -254,27 +271,32 @@ $code.=<<___;
jmp .Louter jmp .Louter
.align 16 .align 16
.Louter: .Louter:
lea 16($j),$m0 lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
lea 32($j),$m1 and \$-16,%rdx
and \$63,$m0 pxor %xmm4,%xmm4
lea 48($j),%rdx pxor %xmm5,%xmm5
and \$63,$m1 ___
and \$63,%rdx for($k=0;$k<$STRIDE/16;$k+=4) {
movdqa `0*$STRIDE/4-128`($bp,$j),%xmm0 $code.=<<___;
movdqa `1*$STRIDE/4-128`($bp,$m0),%xmm1 movdqa `16*($k+0)-128`($bp),%xmm0
movdqa `2*$STRIDE/4-128`($bp,$m1),%xmm2 movdqa `16*($k+1)-128`($bp),%xmm1
movdqa `3*$STRIDE/4-128`($bp,%rdx),%xmm3 movdqa `16*($k+2)-128`($bp),%xmm2
pand %xmm4,%xmm0 movdqa `16*($k+3)-128`($bp),%xmm3
pand %xmm5,%xmm1 pand `16*($k+0)-128`(%rdx),%xmm0
pand %xmm6,%xmm2 pand `16*($k+1)-128`(%rdx),%xmm1
por %xmm1,%xmm0 por %xmm0,%xmm4
pand %xmm7,%xmm3 pand `16*($k+2)-128`(%rdx),%xmm2
por %xmm2,%xmm0 por %xmm1,%xmm5
pand `16*($k+3)-128`(%rdx),%xmm3
por %xmm2,%xmm4
por %xmm3,%xmm5
___
}
$code.=<<___;
por %xmm5,%xmm4
pshufd \$0x4e,%xmm4,%xmm0
por %xmm4,%xmm0
lea $STRIDE($bp),$bp lea $STRIDE($bp),$bp
por %xmm3,%xmm0
pshufd \$0x4e,%xmm0,%xmm1
por %xmm1,%xmm0 # merge upper and lower halves
mov ($ap),%rax # ap[0] mov ($ap),%rax # ap[0]
movq %xmm0,$m0 # m0=bp[i] movq %xmm0,$m0 # m0=bp[i]
@ -324,16 +346,14 @@ $code.=<<___;
mulq $m1 # np[j]*m1 mulq $m1 # np[j]*m1
cmp $num,$j cmp $num,$j
jne .Linner jne .Linner # note that upon exit $j==$num, so
# they can be used interchangeably
movq %xmm8,$j
add %rax,$hi1 add %rax,$hi1
adc \$0,%rdx adc \$0,%rdx
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
mov (%rsp,$num,8),$lo0 mov (%rsp,$num,8),$lo0
adc \$0,%rdx adc \$0,%rdx
mov $hi1,-16(%rsp,$num,8) # tp[j-1] mov $hi1,-16(%rsp,$num,8) # tp[num-1]
mov %rdx,$hi1 mov %rdx,$hi1
xor %rdx,%rdx xor %rdx,%rdx
@ -380,13 +400,7 @@ $code.=<<___;
mov 8(%rsp,$num,8),%rsi # restore %rsp mov 8(%rsp,$num,8),%rsi # restore %rsp
mov \$1,%rax mov \$1,%rax
___
$code.=<<___ if ($win64);
movaps -104(%rsi),%xmm6
movaps -88(%rsi),%xmm7
movaps -72(%rsi),%xmm8
___
$code.=<<___;
mov -48(%rsi),%r15 mov -48(%rsi),%r15
mov -40(%rsi),%r14 mov -40(%rsi),%r14
mov -32(%rsi),%r13 mov -32(%rsi),%r13
@ -1065,10 +1079,15 @@ $code.=<<___;
movq $bptr,%xmm4 movq $bptr,%xmm4
call __bn_sqr8x_internal call __bn_sqr8x_internal
call __bn_post4x_internal
call __bn_sqr8x_internal call __bn_sqr8x_internal
call __bn_post4x_internal
call __bn_sqr8x_internal call __bn_sqr8x_internal
call __bn_post4x_internal
call __bn_sqr8x_internal call __bn_sqr8x_internal
call __bn_post4x_internal
call __bn_sqr8x_internal call __bn_sqr8x_internal
call __bn_post4x_internal
movq %xmm2,$nptr movq %xmm2,$nptr
movq %xmm4,$bptr movq %xmm4,$bptr
@ -1629,7 +1648,7 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
$code.=<<___; $code.=<<___;
movq %xmm2,$nptr movq %xmm2,$nptr
sqr8x_reduction: __bn_sqr8x_reduction:
xor %rax,%rax xor %rax,%rax
lea ($nptr,$num),%rcx # end of n[] lea ($nptr,$num),%rcx # end of n[]
lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
@ -1888,6 +1907,8 @@ sqr8x_reduction:
cmp %rdx,$tptr # end of t[]? cmp %rdx,$tptr # end of t[]?
jb .L8x_reduction_loop jb .L8x_reduction_loop
ret
.size bn_sqr8x_internal,.-bn_sqr8x_internal
___ ___
} }
############################################################## ##############################################################
@ -1896,13 +1917,12 @@ ___
{ {
my ($tptr,$nptr)=("%rbx","%rbp"); my ($tptr,$nptr)=("%rbx","%rbp");
$code.=<<___; $code.=<<___;
#xor %rsi,%rsi # %rsi was $carry above .type __bn_post4x_internal,\@abi-omnipotent
.align 32
__bn_post4x_internal:
mov 8*0($nptr),%r12 mov 8*0($nptr),%r12
sub %r15,%rcx # compare top-most words
lea (%rdi,$num),$tptr # %rdi was $tptr above lea (%rdi,$num),$tptr # %rdi was $tptr above
adc %rsi,%rsi
mov $num,%rcx mov $num,%rcx
or %rsi,%rax
movq %xmm1,$rptr # restore $rptr movq %xmm1,$rptr # restore $rptr
neg %rax neg %rax
movq %xmm1,$aptr # prepare for back-to-back call movq %xmm1,$aptr # prepare for back-to-back call
@ -1946,14 +1966,13 @@ $code.=<<___;
inc %rcx # pass %cf inc %rcx # pass %cf
jnz .Lsqr4x_sub jnz .Lsqr4x_sub
___
}
$code.=<<___;
mov $num,%r10 # prepare for back-to-back call mov $num,%r10 # prepare for back-to-back call
neg $num # restore $num neg $num # restore $num
ret ret
.size bn_sqr8x_internal,.-bn_sqr8x_internal .size __bn_post4x_internal,.-__bn_post4x_internal
___ ___
}
{ {
$code.=<<___; $code.=<<___;
.globl bn_from_montgomery .globl bn_from_montgomery
@ -2061,7 +2080,8 @@ $code.=<<___ if ($addx);
jne .Lfrom_mont_nox jne .Lfrom_mont_nox
lea (%rax,$num),$rptr lea (%rax,$num),$rptr
call sqrx8x_reduction call __bn_sqrx8x_reduction
call __bn_postx4x_internal
pxor %xmm0,%xmm0 pxor %xmm0,%xmm0
lea 48(%rsp),%rax lea 48(%rsp),%rax
@ -2072,7 +2092,8 @@ $code.=<<___ if ($addx);
.Lfrom_mont_nox: .Lfrom_mont_nox:
___ ___
$code.=<<___; $code.=<<___;
call sqr8x_reduction call __bn_sqr8x_reduction
call __bn_post4x_internal
pxor %xmm0,%xmm0 pxor %xmm0,%xmm0
lea 48(%rsp),%rax lea 48(%rsp),%rax
@ -2622,10 +2643,15 @@ bn_powerx5:
.Lpowerx5_body: .Lpowerx5_body:
call __bn_sqrx8x_internal call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal call __bn_sqrx8x_internal
call __bn_postx4x_internal
call __bn_sqrx8x_internal call __bn_sqrx8x_internal
call __bn_postx4x_internal
mov %r10,$num # -num mov %r10,$num # -num
mov $aptr,$rptr mov $aptr,$rptr
@ -3071,7 +3097,7 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
$code.=<<___; $code.=<<___;
movq %xmm2,$nptr movq %xmm2,$nptr
sqrx8x_reduction: __bn_sqrx8x_reduction:
xor %eax,%eax # initial top-most carry bit xor %eax,%eax # initial top-most carry bit
mov 32+8(%rsp),%rbx # n0 mov 32+8(%rsp),%rbx # n0
mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
@ -3279,6 +3305,8 @@ sqrx8x_reduction:
lea 8*8($tptr,%rcx),$tptr # start of current t[] window lea 8*8($tptr,%rcx),$tptr # start of current t[] window
cmp 8+8(%rsp),%r8 # end of t[]? cmp 8+8(%rsp),%r8 # end of t[]?
jb .Lsqrx8x_reduction_loop jb .Lsqrx8x_reduction_loop
ret
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
___ ___
} }
############################################################## ##############################################################
@ -3286,15 +3314,11 @@ ___
# #
{ {
my ($rptr,$nptr)=("%rdx","%rbp"); my ($rptr,$nptr)=("%rdx","%rbp");
my @ri=map("%r$_",(10..13));
my @ni=map("%r$_",(14..15));
$code.=<<___; $code.=<<___;
.align 32
__bn_postx4x_internal:
mov 8*0($nptr),%r12 mov 8*0($nptr),%r12
xor %ebx,%ebx
sub %r15,%rsi # compare top-most words
adc %rbx,%rbx
mov %rcx,%r10 # -$num mov %rcx,%r10 # -$num
or %rbx,%rax
mov %rcx,%r9 # -$num mov %rcx,%r9 # -$num
neg %rax neg %rax
sar \$3+2,%rcx sar \$3+2,%rcx
@ -3308,6 +3332,7 @@ $code.=<<___;
mov 8*3($nptr),%r15 mov 8*3($nptr),%r15
jmp .Lsqrx4x_sub_entry jmp .Lsqrx4x_sub_entry
.align 16
.Lsqrx4x_sub: .Lsqrx4x_sub:
mov 8*0($nptr),%r12 mov 8*0($nptr),%r12
mov 8*1($nptr),%r13 mov 8*1($nptr),%r13
@ -3335,14 +3360,13 @@ $code.=<<___;
inc %rcx inc %rcx
jnz .Lsqrx4x_sub jnz .Lsqrx4x_sub
___
}
$code.=<<___;
neg %r9 # restore $num neg %r9 # restore $num
ret ret
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal .size __bn_postx4x_internal,.-__bn_postx4x_internal
___ ___
}
}}} }}}
{ {
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
@ -3483,9 +3507,6 @@ ___
} }
$code.=<<___; $code.=<<___;
.align 64 .align 64
.Lmagic_masks:
.long 0x00,0x00,0x01,0x01, 0x08,0x08,0x09,0x09
.long 0x10,0x10,0x11,0x11, 0x18,0x18,0x19,0x19
.Linc: .Linc:
.long 0,0, 1,1 .long 0,0, 1,1
.long 2,2, 2,2 .long 2,2, 2,2
@ -3541,13 +3562,6 @@ mul_handler:
mov 192($context),%r10 # pull $num mov 192($context),%r10 # pull $num
mov 8(%rax,%r10,8),%rax # pull saved stack pointer mov 8(%rax,%r10,8),%rax # pull saved stack pointer
movaps -104(%rax),%xmm0
movaps -88(%rax),%xmm1
movaps -72(%rax),%xmm2
movups %xmm0,512($context) # restore context->Xmm6
movups %xmm1,528($context) # restore context->Xmm7
movups %xmm2,544($context) # restore context->Xmm8
jmp .Lbody_proceed jmp .Lbody_proceed
.Lbody_40: .Lbody_40:
@ -3675,8 +3689,9 @@ ___
$code.=<<___; $code.=<<___;
.align 8 .align 8
.LSEH_info_bn_gather5: .LSEH_info_bn_gather5:
.byte 0x01,0x0b,0x02,0x00 .byte 0x01,0x0b,0x03,0x0a
.byte 0x0b,0x01,0x21,0x00 #sub rsp,0x108 .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
.byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
.align 8 .align 8
___ ___
} }