bn/asm/x86_64-mont5.pl: unify gather procedure in hardly used path
and reorganize/harmonize post-conditions. Additional hardening following on from CVE-2016-0702 Reviewed-by: Richard Levitte <levitte@openssl.org> Reviewed-by: Rich Salz <rsalz@openssl.org> (cherry picked from master)
This commit is contained in:
parent
25d14c6c29
commit
515f3be47a
@ -814,34 +814,87 @@ $code.=<<___ if ($addx);
|
||||
jne .Lsqr8x_nox
|
||||
|
||||
call bn_sqrx8x_internal # see x86_64-mont5 module
|
||||
|
||||
pxor %xmm0,%xmm0
|
||||
lea 48(%rsp),%rax
|
||||
shr \$3+2,$num
|
||||
mov 40(%rsp),%rsi # restore %rsp
|
||||
jmp .Lsqr8x_zero
|
||||
# %rax top-most carry
|
||||
# %rbp nptr
|
||||
# %rcx -8*num
|
||||
# %r8 end of tp[2*num]
|
||||
lea (%r8,%rcx),%rbx
|
||||
mov %rcx,$num
|
||||
mov %rcx,%rdx
|
||||
movq %xmm1,$rptr
|
||||
sar \$3+2,%rcx # %cf=0
|
||||
jmp .Lsqr8x_sub
|
||||
|
||||
.align 32
|
||||
.Lsqr8x_nox:
|
||||
___
|
||||
$code.=<<___;
|
||||
call bn_sqr8x_internal # see x86_64-mont5 module
|
||||
|
||||
pxor %xmm0,%xmm0
|
||||
lea 48(%rsp),%rax
|
||||
shr \$3+2,$num
|
||||
mov 40(%rsp),%rsi # restore %rsp
|
||||
jmp .Lsqr8x_zero
|
||||
# %rax top-most carry
|
||||
# %rbp nptr
|
||||
# %r8 -8*num
|
||||
# %rdi end of tp[2*num]
|
||||
lea (%rdi,$num),%rbx
|
||||
mov $num,%rcx
|
||||
mov $num,%rdx
|
||||
movq %xmm1,$rptr
|
||||
sar \$3+2,%rcx # %cf=0
|
||||
jmp .Lsqr8x_sub
|
||||
|
||||
.align 32
|
||||
.Lsqr8x_zero:
|
||||
movdqa %xmm0,16*0(%rax) # wipe t
|
||||
movdqa %xmm0,16*1(%rax)
|
||||
movdqa %xmm0,16*2(%rax)
|
||||
movdqa %xmm0,16*3(%rax)
|
||||
lea 16*4(%rax),%rax
|
||||
dec $num
|
||||
jnz .Lsqr8x_zero
|
||||
.Lsqr8x_sub:
|
||||
mov 8*0(%rbx),%r12
|
||||
mov 8*1(%rbx),%r13
|
||||
mov 8*2(%rbx),%r14
|
||||
mov 8*3(%rbx),%r15
|
||||
lea 8*4(%rbx),%rbx
|
||||
sbb 8*0(%rbp),%r12
|
||||
sbb 8*1(%rbp),%r13
|
||||
sbb 8*2(%rbp),%r14
|
||||
sbb 8*3(%rbp),%r15
|
||||
lea 8*4(%rbp),%rbp
|
||||
mov %r12,8*0($rptr)
|
||||
mov %r13,8*1($rptr)
|
||||
mov %r14,8*2($rptr)
|
||||
mov %r15,8*3($rptr)
|
||||
lea 8*4($rptr),$rptr
|
||||
inc %rcx # preserves %cf
|
||||
jnz .Lsqr8x_sub
|
||||
|
||||
sbb \$0,%rax # top-most carry
|
||||
lea (%rbx,$num),%rbx # rewind
|
||||
lea ($rptr,$num),$rptr # rewind
|
||||
|
||||
movq %rax,%xmm1
|
||||
pxor %xmm0,%xmm0
|
||||
pshufd \$0,%xmm1,%xmm1
|
||||
mov 40(%rsp),%rsi # restore %rsp
|
||||
jmp .Lsqr8x_cond_copy
|
||||
|
||||
.align 32
|
||||
.Lsqr8x_cond_copy:
|
||||
movdqa 16*0(%rbx),%xmm2
|
||||
movdqa 16*1(%rbx),%xmm3
|
||||
lea 16*2(%rbx),%rbx
|
||||
movdqu 16*0($rptr),%xmm4
|
||||
movdqu 16*1($rptr),%xmm5
|
||||
lea 16*2($rptr),$rptr
|
||||
movdqa %xmm0,-16*2(%rbx) # zero tp
|
||||
movdqa %xmm0,-16*1(%rbx)
|
||||
movdqa %xmm0,-16*2(%rbx,%rdx)
|
||||
movdqa %xmm0,-16*1(%rbx,%rdx)
|
||||
pcmpeqd %xmm1,%xmm0
|
||||
pand %xmm1,%xmm2
|
||||
pand %xmm1,%xmm3
|
||||
pand %xmm0,%xmm4
|
||||
pand %xmm0,%xmm5
|
||||
pxor %xmm0,%xmm0
|
||||
por %xmm2,%xmm4
|
||||
por %xmm3,%xmm5
|
||||
movdqu %xmm4,-16*2($rptr)
|
||||
movdqu %xmm5,-16*1($rptr)
|
||||
add \$32,$num
|
||||
jnz .Lsqr8x_cond_copy
|
||||
|
||||
mov \$1,%rax
|
||||
mov -48(%rsi),%r15
|
||||
@ -1108,64 +1161,75 @@ $code.=<<___;
|
||||
adc $zero,%r15 # modulo-scheduled
|
||||
sub 0*8($tptr),$zero # pull top-most carry
|
||||
adc %r15,%r14
|
||||
mov -8($nptr),$mi
|
||||
sbb %r15,%r15 # top-most carry
|
||||
mov %r14,-1*8($tptr)
|
||||
|
||||
cmp 16(%rsp),$bptr
|
||||
jne .Lmulx4x_outer
|
||||
|
||||
sub %r14,$mi # compare top-most words
|
||||
sbb $mi,$mi
|
||||
or $mi,%r15
|
||||
|
||||
neg $num
|
||||
xor %rdx,%rdx
|
||||
mov 32(%rsp),$rptr # restore rp
|
||||
lea 64(%rsp),$tptr
|
||||
|
||||
pxor %xmm0,%xmm0
|
||||
mov 0*8($nptr,$num),%r8
|
||||
mov 1*8($nptr,$num),%r9
|
||||
neg %r8
|
||||
jmp .Lmulx4x_sub_entry
|
||||
sub $num,$nptr # rewind $nptr
|
||||
neg %r15
|
||||
mov $num,%rdx
|
||||
shr \$3+2,$num # %cf=0
|
||||
mov 32(%rsp),$rptr # restore rp
|
||||
jmp .Lmulx4x_sub
|
||||
|
||||
.align 32
|
||||
.Lmulx4x_sub:
|
||||
mov 0*8($nptr,$num),%r8
|
||||
mov 1*8($nptr,$num),%r9
|
||||
not %r8
|
||||
.Lmulx4x_sub_entry:
|
||||
mov 2*8($nptr,$num),%r10
|
||||
not %r9
|
||||
and %r15,%r8
|
||||
mov 3*8($nptr,$num),%r11
|
||||
not %r10
|
||||
and %r15,%r9
|
||||
not %r11
|
||||
and %r15,%r10
|
||||
and %r15,%r11
|
||||
|
||||
neg %rdx # mov %rdx,%cf
|
||||
adc 0*8($tptr),%r8
|
||||
adc 1*8($tptr),%r9
|
||||
movdqa %xmm0,($tptr)
|
||||
adc 2*8($tptr),%r10
|
||||
adc 3*8($tptr),%r11
|
||||
movdqa %xmm0,16($tptr)
|
||||
lea 4*8($tptr),$tptr
|
||||
sbb %rdx,%rdx # mov %cf,%rdx
|
||||
|
||||
mov %r8,0*8($rptr)
|
||||
mov %r9,1*8($rptr)
|
||||
mov %r10,2*8($rptr)
|
||||
mov %r11,3*8($rptr)
|
||||
lea 4*8($rptr),$rptr
|
||||
|
||||
add \$32,$num
|
||||
mov 8*0($tptr),%r11
|
||||
mov 8*1($tptr),%r12
|
||||
mov 8*2($tptr),%r13
|
||||
mov 8*3($tptr),%r14
|
||||
lea 8*4($tptr),$tptr
|
||||
sbb 8*0($nptr),%r11
|
||||
sbb 8*1($nptr),%r12
|
||||
sbb 8*2($nptr),%r13
|
||||
sbb 8*3($nptr),%r14
|
||||
lea 8*4($nptr),$nptr
|
||||
mov %r11,8*0($rptr)
|
||||
mov %r12,8*1($rptr)
|
||||
mov %r13,8*2($rptr)
|
||||
mov %r14,8*3($rptr)
|
||||
lea 8*4($rptr),$rptr
|
||||
dec $num # preserves %cf
|
||||
jnz .Lmulx4x_sub
|
||||
|
||||
sbb \$0,%r15 # top-most carry
|
||||
lea 64(%rsp),$tptr
|
||||
sub %rdx,$rptr # rewind
|
||||
|
||||
movq %r15,%xmm1
|
||||
pxor %xmm0,%xmm0
|
||||
pshufd \$0,%xmm1,%xmm1
|
||||
mov 40(%rsp),%rsi # restore %rsp
|
||||
jmp .Lmulx4x_cond_copy
|
||||
|
||||
.align 32
|
||||
.Lmulx4x_cond_copy:
|
||||
movdqa 16*0($tptr),%xmm2
|
||||
movdqa 16*1($tptr),%xmm3
|
||||
lea 16*2($tptr),$tptr
|
||||
movdqu 16*0($rptr),%xmm4
|
||||
movdqu 16*1($rptr),%xmm5
|
||||
lea 16*2($rptr),$rptr
|
||||
movdqa %xmm0,-16*2($tptr) # zero tp
|
||||
movdqa %xmm0,-16*1($tptr)
|
||||
pcmpeqd %xmm1,%xmm0
|
||||
pand %xmm1,%xmm2
|
||||
pand %xmm1,%xmm3
|
||||
pand %xmm0,%xmm4
|
||||
pand %xmm0,%xmm5
|
||||
pxor %xmm0,%xmm0
|
||||
por %xmm2,%xmm4
|
||||
por %xmm3,%xmm5
|
||||
movdqu %xmm4,-16*2($rptr)
|
||||
movdqu %xmm5,-16*1($rptr)
|
||||
sub \$32,%rdx
|
||||
jnz .Lmulx4x_cond_copy
|
||||
|
||||
mov %rdx,($tptr)
|
||||
|
||||
mov \$1,%rax
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
|
@ -99,25 +99,18 @@ $code.=<<___;
|
||||
.Lmul_enter:
|
||||
mov ${num}d,${num}d
|
||||
mov %rsp,%rax
|
||||
movd `($win64?56:8)`(%rsp),%xmm0 # load 7th argument
|
||||
lea .Lmagic_masks(%rip),%r10
|
||||
movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument
|
||||
lea .Linc(%rip),%r10
|
||||
push %rbx
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea -0x38(%rsp),%rsp
|
||||
movaps %xmm6,(%rsp)
|
||||
movaps %xmm7,0x10(%rsp)
|
||||
movaps %xmm8,0x20(%rsp)
|
||||
___
|
||||
$code.=<<___;
|
||||
|
||||
lea 2($num),%r11
|
||||
neg %r11
|
||||
lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
|
||||
lea -264(%rsp,%r11,8),%rsp # tp=alloca(8*(num+2)+256+8)
|
||||
and \$-1024,%rsp # minimize TLB usage
|
||||
|
||||
mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
|
||||
@ -128,64 +121,89 @@ ___
|
||||
$STRIDE=2**5*8; # 5 is "window size"
|
||||
$N=$STRIDE/4; # should match cache line size
|
||||
$code.=<<___;
|
||||
################################################################
|
||||
# calculate mask: one of %xmm4..7 will contain 0xff..00 or
|
||||
# 0x00..ff denoting which half of a quarter of corresponding
|
||||
# cache line is significant.
|
||||
#
|
||||
movq 56(%r10),%xmm1 # 0b11001
|
||||
movq %xmm0,%rdx
|
||||
pand %xmm1,%xmm0
|
||||
movdqa 0(%r10),%xmm4
|
||||
pshufd \$0,%xmm0,%xmm0 # broadcast masked index
|
||||
movdqa 16(%r10),%xmm5
|
||||
movdqa 32(%r10),%xmm6
|
||||
pcmpeqd %xmm0,%xmm4
|
||||
movdqa 48(%r10),%xmm7
|
||||
pcmpeqd %xmm0,%xmm5
|
||||
pcmpeqd %xmm0,%xmm6
|
||||
pcmpeqd %xmm0,%xmm7
|
||||
movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000
|
||||
movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002
|
||||
lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
|
||||
and \$-16,%r10
|
||||
|
||||
################################################################
|
||||
# calculate index in 1st cache line, but in such manner that
|
||||
# if target data is in another cache line, then relevant
|
||||
# "rotating" reference would land on it...
|
||||
#
|
||||
shr \$1,%rdx # idx/=2
|
||||
mov %rdx,$j
|
||||
shr \$2,%rdx
|
||||
sub %rdx,$j
|
||||
and \$3,$j # (idx-idx/4)%4
|
||||
shl \$4,$j # scale for xmm references
|
||||
pshufd \$0,%xmm5,%xmm5 # broadcast index
|
||||
movdqa %xmm1,%xmm4
|
||||
movdqa %xmm1,%xmm2
|
||||
___
|
||||
########################################################################
|
||||
# calculate mask by comparing 0..31 to index and save result to stack
|
||||
#
|
||||
$code.=<<___;
|
||||
paddd %xmm0,%xmm1
|
||||
pcmpeqd %xmm5,%xmm0 # compare to 1,0
|
||||
.byte 0x67
|
||||
movdqa %xmm4,%xmm3
|
||||
___
|
||||
for($k=0;$k<$STRIDE/16-4;$k+=4) {
|
||||
$code.=<<___;
|
||||
paddd %xmm1,%xmm2
|
||||
pcmpeqd %xmm5,%xmm1 # compare to 3,2
|
||||
movdqa %xmm0,`16*($k+0)+112`(%r10)
|
||||
movdqa %xmm4,%xmm0
|
||||
|
||||
################################################################
|
||||
# "rotating" references are touching different cache banks in
|
||||
# different cache lines, so that not only all cache lines are
|
||||
# referred in each iteration, but even all cache banks.
|
||||
#
|
||||
lea 16($j),$m0
|
||||
lea 32($j),$m1
|
||||
and \$63,$m0
|
||||
lea 48($j),%rdx
|
||||
and \$63,$m1
|
||||
and \$63,%rdx
|
||||
movdqa `0*$STRIDE/4-128`($bp,$j),%xmm0
|
||||
movdqa `1*$STRIDE/4-128`($bp,$m0),%xmm1
|
||||
movdqa `2*$STRIDE/4-128`($bp,$m1),%xmm2
|
||||
movdqa `3*$STRIDE/4-128`($bp,%rdx),%xmm3
|
||||
pand %xmm4,%xmm0
|
||||
pand %xmm5,%xmm1
|
||||
pand %xmm6,%xmm2
|
||||
por %xmm1,%xmm0
|
||||
pand %xmm7,%xmm3
|
||||
paddd %xmm2,%xmm3
|
||||
pcmpeqd %xmm5,%xmm2 # compare to 5,4
|
||||
movdqa %xmm1,`16*($k+1)+112`(%r10)
|
||||
movdqa %xmm4,%xmm1
|
||||
|
||||
paddd %xmm3,%xmm0
|
||||
pcmpeqd %xmm5,%xmm3 # compare to 7,6
|
||||
movdqa %xmm2,`16*($k+2)+112`(%r10)
|
||||
movdqa %xmm4,%xmm2
|
||||
|
||||
paddd %xmm0,%xmm1
|
||||
pcmpeqd %xmm5,%xmm0
|
||||
movdqa %xmm3,`16*($k+3)+112`(%r10)
|
||||
movdqa %xmm4,%xmm3
|
||||
___
|
||||
}
|
||||
$code.=<<___; # last iteration can be optimized
|
||||
paddd %xmm1,%xmm2
|
||||
pcmpeqd %xmm5,%xmm1
|
||||
movdqa %xmm0,`16*($k+0)+112`(%r10)
|
||||
|
||||
paddd %xmm2,%xmm3
|
||||
.byte 0x67
|
||||
pcmpeqd %xmm5,%xmm2
|
||||
movdqa %xmm1,`16*($k+1)+112`(%r10)
|
||||
|
||||
pcmpeqd %xmm5,%xmm3
|
||||
movdqa %xmm2,`16*($k+2)+112`(%r10)
|
||||
pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register
|
||||
|
||||
pand `16*($k+1)-128`($bp),%xmm1
|
||||
pand `16*($k+2)-128`($bp),%xmm2
|
||||
movdqa %xmm3,`16*($k+3)+112`(%r10)
|
||||
pand `16*($k+3)-128`($bp),%xmm3
|
||||
por %xmm2,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
por %xmm3,%xmm0
|
||||
movq $j,%xmm8
|
||||
|
||||
por %xmm3,%xmm1
|
||||
___
|
||||
for($k=0;$k<$STRIDE/16-4;$k+=4) {
|
||||
$code.=<<___;
|
||||
movdqa `16*($k+0)-128`($bp),%xmm4
|
||||
movdqa `16*($k+1)-128`($bp),%xmm5
|
||||
movdqa `16*($k+2)-128`($bp),%xmm2
|
||||
pand `16*($k+0)+112`(%r10),%xmm4
|
||||
movdqa `16*($k+3)-128`($bp),%xmm3
|
||||
pand `16*($k+1)+112`(%r10),%xmm5
|
||||
por %xmm4,%xmm0
|
||||
pand `16*($k+2)+112`(%r10),%xmm2
|
||||
por %xmm5,%xmm1
|
||||
pand `16*($k+3)+112`(%r10),%xmm3
|
||||
por %xmm2,%xmm0
|
||||
por %xmm3,%xmm1
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
por %xmm1,%xmm0
|
||||
pshufd \$0x4e,%xmm0,%xmm1
|
||||
por %xmm1,%xmm0 # merge upper and lower halves
|
||||
|
||||
por %xmm1,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
movq %xmm0,$m0 # m0=bp[0]
|
||||
|
||||
mov ($n0),$n0 # pull n0[0] value
|
||||
@ -232,15 +250,14 @@ $code.=<<___;
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
cmp $num,$j
|
||||
jne .L1st
|
||||
|
||||
movq %xmm8,$j
|
||||
jne .L1st # note that upon exit $j==$num, so
|
||||
# they can be used interchangeably
|
||||
|
||||
add %rax,$hi1
|
||||
adc \$0,%rdx
|
||||
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
|
||||
adc \$0,%rdx
|
||||
mov $hi1,-16(%rsp,$num,8) # tp[j-1]
|
||||
mov $hi1,-16(%rsp,$num,8) # tp[num-1]
|
||||
mov %rdx,$hi1
|
||||
mov $lo0,$hi0
|
||||
|
||||
@ -254,27 +271,32 @@ $code.=<<___;
|
||||
jmp .Louter
|
||||
.align 16
|
||||
.Louter:
|
||||
lea 16($j),$m0
|
||||
lea 32($j),$m1
|
||||
and \$63,$m0
|
||||
lea 48($j),%rdx
|
||||
and \$63,$m1
|
||||
and \$63,%rdx
|
||||
movdqa `0*$STRIDE/4-128`($bp,$j),%xmm0
|
||||
movdqa `1*$STRIDE/4-128`($bp,$m0),%xmm1
|
||||
movdqa `2*$STRIDE/4-128`($bp,$m1),%xmm2
|
||||
movdqa `3*$STRIDE/4-128`($bp,%rdx),%xmm3
|
||||
pand %xmm4,%xmm0
|
||||
pand %xmm5,%xmm1
|
||||
pand %xmm6,%xmm2
|
||||
por %xmm1,%xmm0
|
||||
pand %xmm7,%xmm3
|
||||
por %xmm2,%xmm0
|
||||
lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization)
|
||||
and \$-16,%rdx
|
||||
pxor %xmm4,%xmm4
|
||||
pxor %xmm5,%xmm5
|
||||
___
|
||||
for($k=0;$k<$STRIDE/16;$k+=4) {
|
||||
$code.=<<___;
|
||||
movdqa `16*($k+0)-128`($bp),%xmm0
|
||||
movdqa `16*($k+1)-128`($bp),%xmm1
|
||||
movdqa `16*($k+2)-128`($bp),%xmm2
|
||||
movdqa `16*($k+3)-128`($bp),%xmm3
|
||||
pand `16*($k+0)-128`(%rdx),%xmm0
|
||||
pand `16*($k+1)-128`(%rdx),%xmm1
|
||||
por %xmm0,%xmm4
|
||||
pand `16*($k+2)-128`(%rdx),%xmm2
|
||||
por %xmm1,%xmm5
|
||||
pand `16*($k+3)-128`(%rdx),%xmm3
|
||||
por %xmm2,%xmm4
|
||||
por %xmm3,%xmm5
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
por %xmm5,%xmm4
|
||||
pshufd \$0x4e,%xmm4,%xmm0
|
||||
por %xmm4,%xmm0
|
||||
lea $STRIDE($bp),$bp
|
||||
por %xmm3,%xmm0
|
||||
|
||||
pshufd \$0x4e,%xmm0,%xmm1
|
||||
por %xmm1,%xmm0 # merge upper and lower halves
|
||||
|
||||
mov ($ap),%rax # ap[0]
|
||||
movq %xmm0,$m0 # m0=bp[i]
|
||||
@ -324,16 +346,14 @@ $code.=<<___;
|
||||
|
||||
mulq $m1 # np[j]*m1
|
||||
cmp $num,$j
|
||||
jne .Linner
|
||||
|
||||
movq %xmm8,$j
|
||||
|
||||
jne .Linner # note that upon exit $j==$num, so
|
||||
# they can be used interchangeably
|
||||
add %rax,$hi1
|
||||
adc \$0,%rdx
|
||||
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||
mov (%rsp,$num,8),$lo0
|
||||
adc \$0,%rdx
|
||||
mov $hi1,-16(%rsp,$num,8) # tp[j-1]
|
||||
mov $hi1,-16(%rsp,$num,8) # tp[num-1]
|
||||
mov %rdx,$hi1
|
||||
|
||||
xor %rdx,%rdx
|
||||
@ -380,13 +400,7 @@ $code.=<<___;
|
||||
|
||||
mov 8(%rsp,$num,8),%rsi # restore %rsp
|
||||
mov \$1,%rax
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps -104(%rsi),%xmm6
|
||||
movaps -88(%rsi),%xmm7
|
||||
movaps -72(%rsi),%xmm8
|
||||
___
|
||||
$code.=<<___;
|
||||
|
||||
mov -48(%rsi),%r15
|
||||
mov -40(%rsi),%r14
|
||||
mov -32(%rsi),%r13
|
||||
@ -1065,10 +1079,15 @@ $code.=<<___;
|
||||
movq $bptr,%xmm4
|
||||
|
||||
call __bn_sqr8x_internal
|
||||
call __bn_post4x_internal
|
||||
call __bn_sqr8x_internal
|
||||
call __bn_post4x_internal
|
||||
call __bn_sqr8x_internal
|
||||
call __bn_post4x_internal
|
||||
call __bn_sqr8x_internal
|
||||
call __bn_post4x_internal
|
||||
call __bn_sqr8x_internal
|
||||
call __bn_post4x_internal
|
||||
|
||||
movq %xmm2,$nptr
|
||||
movq %xmm4,$bptr
|
||||
@ -1629,7 +1648,7 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
|
||||
|
||||
$code.=<<___;
|
||||
movq %xmm2,$nptr
|
||||
sqr8x_reduction:
|
||||
__bn_sqr8x_reduction:
|
||||
xor %rax,%rax
|
||||
lea ($nptr,$num),%rcx # end of n[]
|
||||
lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
|
||||
@ -1888,6 +1907,8 @@ sqr8x_reduction:
|
||||
|
||||
cmp %rdx,$tptr # end of t[]?
|
||||
jb .L8x_reduction_loop
|
||||
ret
|
||||
.size bn_sqr8x_internal,.-bn_sqr8x_internal
|
||||
___
|
||||
}
|
||||
##############################################################
|
||||
@ -1896,13 +1917,12 @@ ___
|
||||
{
|
||||
my ($tptr,$nptr)=("%rbx","%rbp");
|
||||
$code.=<<___;
|
||||
#xor %rsi,%rsi # %rsi was $carry above
|
||||
.type __bn_post4x_internal,\@abi-omnipotent
|
||||
.align 32
|
||||
__bn_post4x_internal:
|
||||
mov 8*0($nptr),%r12
|
||||
sub %r15,%rcx # compare top-most words
|
||||
lea (%rdi,$num),$tptr # %rdi was $tptr above
|
||||
adc %rsi,%rsi
|
||||
mov $num,%rcx
|
||||
or %rsi,%rax
|
||||
movq %xmm1,$rptr # restore $rptr
|
||||
neg %rax
|
||||
movq %xmm1,$aptr # prepare for back-to-back call
|
||||
@ -1946,14 +1966,13 @@ $code.=<<___;
|
||||
|
||||
inc %rcx # pass %cf
|
||||
jnz .Lsqr4x_sub
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
|
||||
mov $num,%r10 # prepare for back-to-back call
|
||||
neg $num # restore $num
|
||||
ret
|
||||
.size bn_sqr8x_internal,.-bn_sqr8x_internal
|
||||
.size __bn_post4x_internal,.-__bn_post4x_internal
|
||||
___
|
||||
}
|
||||
{
|
||||
$code.=<<___;
|
||||
.globl bn_from_montgomery
|
||||
@ -2061,7 +2080,8 @@ $code.=<<___ if ($addx);
|
||||
jne .Lfrom_mont_nox
|
||||
|
||||
lea (%rax,$num),$rptr
|
||||
call sqrx8x_reduction
|
||||
call __bn_sqrx8x_reduction
|
||||
call __bn_postx4x_internal
|
||||
|
||||
pxor %xmm0,%xmm0
|
||||
lea 48(%rsp),%rax
|
||||
@ -2072,7 +2092,8 @@ $code.=<<___ if ($addx);
|
||||
.Lfrom_mont_nox:
|
||||
___
|
||||
$code.=<<___;
|
||||
call sqr8x_reduction
|
||||
call __bn_sqr8x_reduction
|
||||
call __bn_post4x_internal
|
||||
|
||||
pxor %xmm0,%xmm0
|
||||
lea 48(%rsp),%rax
|
||||
@ -2622,10 +2643,15 @@ bn_powerx5:
|
||||
.Lpowerx5_body:
|
||||
|
||||
call __bn_sqrx8x_internal
|
||||
call __bn_postx4x_internal
|
||||
call __bn_sqrx8x_internal
|
||||
call __bn_postx4x_internal
|
||||
call __bn_sqrx8x_internal
|
||||
call __bn_postx4x_internal
|
||||
call __bn_sqrx8x_internal
|
||||
call __bn_postx4x_internal
|
||||
call __bn_sqrx8x_internal
|
||||
call __bn_postx4x_internal
|
||||
|
||||
mov %r10,$num # -num
|
||||
mov $aptr,$rptr
|
||||
@ -3071,7 +3097,7 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
|
||||
|
||||
$code.=<<___;
|
||||
movq %xmm2,$nptr
|
||||
sqrx8x_reduction:
|
||||
__bn_sqrx8x_reduction:
|
||||
xor %eax,%eax # initial top-most carry bit
|
||||
mov 32+8(%rsp),%rbx # n0
|
||||
mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
|
||||
@ -3279,6 +3305,8 @@ sqrx8x_reduction:
|
||||
lea 8*8($tptr,%rcx),$tptr # start of current t[] window
|
||||
cmp 8+8(%rsp),%r8 # end of t[]?
|
||||
jb .Lsqrx8x_reduction_loop
|
||||
ret
|
||||
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
|
||||
___
|
||||
}
|
||||
##############################################################
|
||||
@ -3286,15 +3314,11 @@ ___
|
||||
#
|
||||
{
|
||||
my ($rptr,$nptr)=("%rdx","%rbp");
|
||||
my @ri=map("%r$_",(10..13));
|
||||
my @ni=map("%r$_",(14..15));
|
||||
$code.=<<___;
|
||||
.align 32
|
||||
__bn_postx4x_internal:
|
||||
mov 8*0($nptr),%r12
|
||||
xor %ebx,%ebx
|
||||
sub %r15,%rsi # compare top-most words
|
||||
adc %rbx,%rbx
|
||||
mov %rcx,%r10 # -$num
|
||||
or %rbx,%rax
|
||||
mov %rcx,%r9 # -$num
|
||||
neg %rax
|
||||
sar \$3+2,%rcx
|
||||
@ -3308,6 +3332,7 @@ $code.=<<___;
|
||||
mov 8*3($nptr),%r15
|
||||
jmp .Lsqrx4x_sub_entry
|
||||
|
||||
.align 16
|
||||
.Lsqrx4x_sub:
|
||||
mov 8*0($nptr),%r12
|
||||
mov 8*1($nptr),%r13
|
||||
@ -3335,14 +3360,13 @@ $code.=<<___;
|
||||
|
||||
inc %rcx
|
||||
jnz .Lsqrx4x_sub
|
||||
___
|
||||
}
|
||||
$code.=<<___;
|
||||
|
||||
neg %r9 # restore $num
|
||||
|
||||
ret
|
||||
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal
|
||||
.size __bn_postx4x_internal,.-__bn_postx4x_internal
|
||||
___
|
||||
}
|
||||
}}}
|
||||
{
|
||||
my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
|
||||
@ -3483,9 +3507,6 @@ ___
|
||||
}
|
||||
$code.=<<___;
|
||||
.align 64
|
||||
.Lmagic_masks:
|
||||
.long 0x00,0x00,0x01,0x01, 0x08,0x08,0x09,0x09
|
||||
.long 0x10,0x10,0x11,0x11, 0x18,0x18,0x19,0x19
|
||||
.Linc:
|
||||
.long 0,0, 1,1
|
||||
.long 2,2, 2,2
|
||||
@ -3541,13 +3562,6 @@ mul_handler:
|
||||
mov 192($context),%r10 # pull $num
|
||||
mov 8(%rax,%r10,8),%rax # pull saved stack pointer
|
||||
|
||||
movaps -104(%rax),%xmm0
|
||||
movaps -88(%rax),%xmm1
|
||||
movaps -72(%rax),%xmm2
|
||||
|
||||
movups %xmm0,512($context) # restore context->Xmm6
|
||||
movups %xmm1,528($context) # restore context->Xmm7
|
||||
movups %xmm2,544($context) # restore context->Xmm8
|
||||
jmp .Lbody_proceed
|
||||
|
||||
.Lbody_40:
|
||||
@ -3675,8 +3689,9 @@ ___
|
||||
$code.=<<___;
|
||||
.align 8
|
||||
.LSEH_info_bn_gather5:
|
||||
.byte 0x01,0x0b,0x02,0x00
|
||||
.byte 0x0b,0x01,0x21,0x00 #sub rsp,0x108
|
||||
.byte 0x01,0x0b,0x03,0x0a
|
||||
.byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108
|
||||
.byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp)
|
||||
.align 8
|
||||
___
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user