bn/asm/x86_64-mont5.pl: unify gather procedure in hardly used path
and reorganize/harmonize post-conditions.

Additional hardening following on from CVE-2016-0702.

Reviewed-by: Richard Levitte <levitte@openssl.org>
Reviewed-by: Rich Salz <rsalz@openssl.org>
(cherry picked from master)

parent 25d14c6c29
commit 515f3be47a
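
The change replaces address-dependent, cache-line-striped table lookups with a gather that reads every table entry and keeps the wanted one behind an arithmetic mask, so the memory trace no longer depends on the secret index. A minimal C sketch of the idea, assuming a 32-entry table of 64-bit words (the names and the scalar shape are illustrative, not OpenSSL's API):

#include <stdint.h>

/* Constant-time gather: touch every entry, select the wanted one
 * with a branch-free mask instead of indexing with the secret. */
static uint64_t gather_ct(const uint64_t table[32], uint64_t idx)
{
    uint64_t acc = 0;
    for (uint64_t i = 0; i < 32; i++) {
        uint64_t x = i ^ idx;                    /* 0 iff i == idx */
        uint64_t mask = ((x | (0 - x)) >> 63) - 1; /* all-ones iff x == 0 */
        acc |= table[i] & mask;
    }
    return acc;
}

Because acc accumulates a masked contribution from every entry, an attacker observing cache activity, as in the CacheBleed attack behind CVE-2016-0702, sees the same access pattern whatever idx is.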
crypto/bn/asm/x86_64-mont.pl
@@ -795,7 +795,7 @@ bn_sqr8x_mont:
 	sub	%r11,%rsp
 .Lsqr8x_sp_done:
 	and	\$-64,%rsp
 	mov	$num,%r10
 	neg	$num
 
 	mov	$n0,  32(%rsp)
@@ -814,34 +814,87 @@ $code.=<<___ if ($addx);
 	jne	.Lsqr8x_nox
 
 	call	bn_sqrx8x_internal	# see x86_64-mont5 module
-	pxor	%xmm0,%xmm0
-	lea	48(%rsp),%rax
-	shr	\$3+2,$num
-	mov	40(%rsp),%rsi		# restore %rsp
-	jmp	.Lsqr8x_zero
+					# %rax	top-most carry
+					# %rbp	nptr
+					# %rcx	-8*num
+					# %r8	end of tp[2*num]
+	lea	(%r8,%rcx),%rbx
+	mov	%rcx,$num
+	mov	%rcx,%rdx
+	movq	%xmm1,$rptr
+	sar	\$3+2,%rcx		# %cf=0
+	jmp	.Lsqr8x_sub
 
 .align	32
 .Lsqr8x_nox:
 ___
 $code.=<<___;
 	call	bn_sqr8x_internal	# see x86_64-mont5 module
-	pxor	%xmm0,%xmm0
-	lea	48(%rsp),%rax
-	shr	\$3+2,$num
-	mov	40(%rsp),%rsi		# restore %rsp
-	jmp	.Lsqr8x_zero
+					# %rax	top-most carry
+					# %rbp	nptr
+					# %r8	-8*num
+					# %rdi	end of tp[2*num]
+	lea	(%rdi,$num),%rbx
+	mov	$num,%rcx
+	mov	$num,%rdx
+	movq	%xmm1,$rptr
+	sar	\$3+2,%rcx		# %cf=0
+	jmp	.Lsqr8x_sub
 
 .align	32
-.Lsqr8x_zero:
-	movdqa	%xmm0,16*0(%rax)	# wipe t
-	movdqa	%xmm0,16*1(%rax)
-	movdqa	%xmm0,16*2(%rax)
-	movdqa	%xmm0,16*3(%rax)
-	lea	16*4(%rax),%rax
-	dec	$num
-	jnz	.Lsqr8x_zero
+.Lsqr8x_sub:
+	mov	8*0(%rbx),%r12
+	mov	8*1(%rbx),%r13
+	mov	8*2(%rbx),%r14
+	mov	8*3(%rbx),%r15
+	lea	8*4(%rbx),%rbx
+	sbb	8*0(%rbp),%r12
+	sbb	8*1(%rbp),%r13
+	sbb	8*2(%rbp),%r14
+	sbb	8*3(%rbp),%r15
+	lea	8*4(%rbp),%rbp
+	mov	%r12,8*0($rptr)
+	mov	%r13,8*1($rptr)
+	mov	%r14,8*2($rptr)
+	mov	%r15,8*3($rptr)
+	lea	8*4($rptr),$rptr
+	inc	%rcx			# preserves %cf
+	jnz	.Lsqr8x_sub
+
+	sbb	\$0,%rax		# top-most carry
+	lea	(%rbx,$num),%rbx	# rewind
+	lea	($rptr,$num),$rptr	# rewind
+
+	movq	%rax,%xmm1
+	pxor	%xmm0,%xmm0
+	pshufd	\$0,%xmm1,%xmm1
+	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lsqr8x_cond_copy
+
+.align	32
+.Lsqr8x_cond_copy:
+	movdqa	16*0(%rbx),%xmm2
+	movdqa	16*1(%rbx),%xmm3
+	lea	16*2(%rbx),%rbx
+	movdqu	16*0($rptr),%xmm4
+	movdqu	16*1($rptr),%xmm5
+	lea	16*2($rptr),$rptr
+	movdqa	%xmm0,-16*2(%rbx)	# zero tp
+	movdqa	%xmm0,-16*1(%rbx)
+	movdqa	%xmm0,-16*2(%rbx,%rdx)
+	movdqa	%xmm0,-16*1(%rbx,%rdx)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-16*2($rptr)
+	movdqu	%xmm5,-16*1($rptr)
+	add	\$32,$num
+	jnz	.Lsqr8x_cond_copy
 
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
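
The new .Lsqr8x_sub/.Lsqr8x_cond_copy tail above harmonizes the post-condition: the modulus is always subtracted from the intermediate t into rp, the borrow is folded into the top-most carry with sbb, and a branch-free masked copy then commits either t or t-n to rp while wiping the temporary. A scalar C sketch of that data flow, with illustrative names (the assembly uses sbb and pcmpeqd/pand/por where this sketch leans on C comparisons):

#include <stdint.h>
#include <stddef.h>

static void post_condition(uint64_t *rp, uint64_t *tp,
                           const uint64_t *np, size_t num,
                           uint64_t carry)   /* top-most carry out of reduction */
{
    uint64_t borrow = 0;

    /* rp = tp - np, always executed */
    for (size_t i = 0; i < num; i++) {
        uint64_t t = tp[i], s = t - np[i];
        rp[i] = s - borrow;
        borrow = (t < np[i]) | (s < borrow);
    }

    /* all-ones iff the subtraction underflowed overall (take tp) */
    uint64_t mask = 0 - ((carry - borrow) >> 63);
    for (size_t i = 0; i < num; i++) {
        rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
        tp[i] = 0;      /* wipe the temporary, as the cond-copy pass does */
    }
}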
@@ -1108,64 +1161,75 @@ $code.=<<___;
 	adc	$zero,%r15		# modulo-scheduled
 	sub	0*8($tptr),$zero	# pull top-most carry
 	adc	%r15,%r14
-	mov	-8($nptr),$mi
 	sbb	%r15,%r15		# top-most carry
 	mov	%r14,-1*8($tptr)
 
 	cmp	16(%rsp),$bptr
 	jne	.Lmulx4x_outer
 
-	sub	%r14,$mi		# compare top-most words
-	sbb	$mi,$mi
-	or	$mi,%r15
-
-	neg	$num
-	xor	%rdx,%rdx
-	mov	32(%rsp),$rptr		# restore rp
 	lea	64(%rsp),$tptr
-	pxor	%xmm0,%xmm0
-	mov	0*8($nptr,$num),%r8
-	mov	1*8($nptr,$num),%r9
-	neg	%r8
-	jmp	.Lmulx4x_sub_entry
+	sub	$num,$nptr		# rewind $nptr
+	neg	%r15
+	mov	$num,%rdx
+	shr	\$3+2,$num		# %cf=0
+	mov	32(%rsp),$rptr		# restore rp
+	jmp	.Lmulx4x_sub
 
 .align	32
 .Lmulx4x_sub:
-	mov	0*8($nptr,$num),%r8
-	mov	1*8($nptr,$num),%r9
-	not	%r8
-.Lmulx4x_sub_entry:
-	mov	2*8($nptr,$num),%r10
-	not	%r9
-	and	%r15,%r8
-	mov	3*8($nptr,$num),%r11
-	not	%r10
-	and	%r15,%r9
-	not	%r11
-	and	%r15,%r10
-	and	%r15,%r11
-
-	neg	%rdx			# mov %rdx,%cf
-	adc	0*8($tptr),%r8
-	adc	1*8($tptr),%r9
-	movdqa	%xmm0,($tptr)
-	adc	2*8($tptr),%r10
-	adc	3*8($tptr),%r11
-	movdqa	%xmm0,16($tptr)
-	lea	4*8($tptr),$tptr
-	sbb	%rdx,%rdx		# mov %cf,%rdx
-
-	mov	%r8,0*8($rptr)
-	mov	%r9,1*8($rptr)
-	mov	%r10,2*8($rptr)
-	mov	%r11,3*8($rptr)
-	lea	4*8($rptr),$rptr
-
-	add	\$32,$num
+	mov	8*0($tptr),%r11
+	mov	8*1($tptr),%r12
+	mov	8*2($tptr),%r13
+	mov	8*3($tptr),%r14
+	lea	8*4($tptr),$tptr
+	sbb	8*0($nptr),%r11
+	sbb	8*1($nptr),%r12
+	sbb	8*2($nptr),%r13
+	sbb	8*3($nptr),%r14
+	lea	8*4($nptr),$nptr
+	mov	%r11,8*0($rptr)
+	mov	%r12,8*1($rptr)
+	mov	%r13,8*2($rptr)
+	mov	%r14,8*3($rptr)
+	lea	8*4($rptr),$rptr
+	dec	$num			# preserves %cf
 	jnz	.Lmulx4x_sub
 
+	sbb	\$0,%r15		# top-most carry
+	lea	64(%rsp),$tptr
+	sub	%rdx,$rptr		# rewind
+
+	movq	%r15,%xmm1
+	pxor	%xmm0,%xmm0
+	pshufd	\$0,%xmm1,%xmm1
 	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lmulx4x_cond_copy
+
+.align	32
+.Lmulx4x_cond_copy:
+	movdqa	16*0($tptr),%xmm2
+	movdqa	16*1($tptr),%xmm3
+	lea	16*2($tptr),$tptr
+	movdqu	16*0($rptr),%xmm4
+	movdqu	16*1($rptr),%xmm5
+	lea	16*2($rptr),$rptr
+	movdqa	%xmm0,-16*2($tptr)	# zero tp
+	movdqa	%xmm0,-16*1($tptr)
+	pcmpeqd	%xmm1,%xmm0
+	pand	%xmm1,%xmm2
+	pand	%xmm1,%xmm3
+	pand	%xmm0,%xmm4
+	pand	%xmm0,%xmm5
+	pxor	%xmm0,%xmm0
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+	movdqu	%xmm4,-16*2($rptr)
+	movdqu	%xmm5,-16*1($rptr)
+	sub	\$32,%rdx
+	jnz	.Lmulx4x_cond_copy
+
+	mov	%rdx,($tptr)
+
 	mov	\$1,%rax
 	mov	-48(%rsi),%r15
 	mov	-40(%rsi),%r14
crypto/bn/asm/x86_64-mont5.pl
@@ -99,25 +99,18 @@ $code.=<<___;
 .Lmul_enter:
 	mov	${num}d,${num}d
 	mov	%rsp,%rax
-	movd	`($win64?56:8)`(%rsp),%xmm0	# load 7th argument
-	lea	.Lmagic_masks(%rip),%r10
+	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
+	lea	.Linc(%rip),%r10
 	push	%rbx
 	push	%rbp
 	push	%r12
 	push	%r13
 	push	%r14
 	push	%r15
-___
-$code.=<<___ if ($win64);
-	lea	-0x38(%rsp),%rsp
-	movaps	%xmm6,(%rsp)
-	movaps	%xmm7,0x10(%rsp)
-	movaps	%xmm8,0x20(%rsp)
-___
-$code.=<<___;
+
 	lea	2($num),%r11
 	neg	%r11
-	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
+	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
 	and	\$-1024,%rsp		# minimize TLB usage
 
 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
@@ -128,64 +121,89 @@ ___
 $STRIDE=2**5*8;		# 5 is "window size"
 $N=$STRIDE/4;		# should match cache line size
 $code.=<<___;
-	################################################################
-	# calculate mask: one of %xmm4..7 will contain 0xff..00 or
-	# 0x00..ff denoting which half of a quarter of corresponding
-	# cache line is significant.
-	#
-	movq	56(%r10),%xmm1		# 0b11001
-	movq	%xmm0,%rdx
-	pand	%xmm1,%xmm0
-	movdqa	0(%r10),%xmm4
-	pshufd	\$0,%xmm0,%xmm0		# broadcast masked index
-	movdqa	16(%r10),%xmm5
-	movdqa	32(%r10),%xmm6
-	pcmpeqd	%xmm0,%xmm4
-	movdqa	48(%r10),%xmm7
-	pcmpeqd	%xmm0,%xmm5
-	pcmpeqd	%xmm0,%xmm6
-	pcmpeqd	%xmm0,%xmm7
-
-	################################################################
-	# calculate index in 1st cache line, but in such manner that
-	# if target data is in another cache line, then relevant
-	# "rotating" reference would land on it...
-	#
-	shr	\$1,%rdx		# idx/=2
-	mov	%rdx,$j
-	shr	\$2,%rdx
-	sub	%rdx,$j
-	and	\$3,$j			# (idx-idx/4)%4
-	shl	\$4,$j			# scale for xmm references
-
-	################################################################
-	# "rotating" references are touching different cache banks in
-	# different cache lines, so that not only all cache lines are
-	# referred in each iteration, but even all cache banks.
-	#
-	lea	16($j),$m0
-	lea	32($j),$m1
-	and	\$63,$m0
-	lea	48($j),%rdx
-	and	\$63,$m1
-	and	\$63,%rdx
-	movdqa	`0*$STRIDE/4-128`($bp,$j),%xmm0
-	movdqa	`1*$STRIDE/4-128`($bp,$m0),%xmm1
-	movdqa	`2*$STRIDE/4-128`($bp,$m1),%xmm2
-	movdqa	`3*$STRIDE/4-128`($bp,%rdx),%xmm3
-	pand	%xmm4,%xmm0
-	pand	%xmm5,%xmm1
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	pand	%xmm7,%xmm3
+	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
+	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
+	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
+	and	\$-16,%r10
+
+	pshufd	\$0,%xmm5,%xmm5		# broadcast index
+	movdqa	%xmm1,%xmm4
+	movdqa	%xmm1,%xmm2
+___
+########################################################################
+# calculate mask by comparing 0..31 to index and save result to stack
+#
+$code.=<<___;
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
+	.byte	0x67
+	movdqa	%xmm4,%xmm3
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
+	movdqa	%xmm0,`16*($k+0)+112`(%r10)
+	movdqa	%xmm4,%xmm0
+
+	paddd	%xmm2,%xmm3
+	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
+	movdqa	%xmm1,`16*($k+1)+112`(%r10)
+	movdqa	%xmm4,%xmm1
+
+	paddd	%xmm3,%xmm0
+	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
+	movdqa	%xmm2,`16*($k+2)+112`(%r10)
+	movdqa	%xmm4,%xmm2
+
+	paddd	%xmm0,%xmm1
+	pcmpeqd	%xmm5,%xmm0
+	movdqa	%xmm3,`16*($k+3)+112`(%r10)
+	movdqa	%xmm4,%xmm3
+___
+}
+$code.=<<___;	# last iteration can be optimized
+	paddd	%xmm1,%xmm2
+	pcmpeqd	%xmm5,%xmm1
+	movdqa	%xmm0,`16*($k+0)+112`(%r10)
+
+	paddd	%xmm2,%xmm3
+	.byte	0x67
+	pcmpeqd	%xmm5,%xmm2
+	movdqa	%xmm1,`16*($k+1)+112`(%r10)
+
+	pcmpeqd	%xmm5,%xmm3
+	movdqa	%xmm2,`16*($k+2)+112`(%r10)
+	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
+
+	pand	`16*($k+1)-128`($bp),%xmm1
+	pand	`16*($k+2)-128`($bp),%xmm2
+	movdqa	%xmm3,`16*($k+3)+112`(%r10)
+	pand	`16*($k+3)-128`($bp),%xmm3
 	por	%xmm2,%xmm0
-	lea	$STRIDE($bp),$bp
-	por	%xmm3,%xmm0
-	movq	$j,%xmm8
+	por	%xmm3,%xmm1
+___
+for($k=0;$k<$STRIDE/16-4;$k+=4) {
+$code.=<<___;
+	movdqa	`16*($k+0)-128`($bp),%xmm4
+	movdqa	`16*($k+1)-128`($bp),%xmm5
+	movdqa	`16*($k+2)-128`($bp),%xmm2
+	pand	`16*($k+0)+112`(%r10),%xmm4
+	movdqa	`16*($k+3)-128`($bp),%xmm3
+	pand	`16*($k+1)+112`(%r10),%xmm5
+	por	%xmm4,%xmm0
+	pand	`16*($k+2)+112`(%r10),%xmm2
+	por	%xmm5,%xmm1
+	pand	`16*($k+3)+112`(%r10),%xmm3
+	por	%xmm2,%xmm0
+	por	%xmm3,%xmm1
+___
+}
+$code.=<<___;
+	por	%xmm1,%xmm0
 	pshufd	\$0x4e,%xmm0,%xmm1
-	por	%xmm1,%xmm0		# merge upper and lower halves
+	por	%xmm1,%xmm0
+	lea	$STRIDE($bp),$bp
 	movq	%xmm0,$m0		# m0=bp[0]
 
 	mov	($n0),$n0		# pull n0[0] value
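
Per the "calculate mask by comparing 0..31 to index" comment above, the rewritten gather precomputes a 256-byte mask table on the stack once per call: mask k is all-ones exactly when k equals the secret 5-bit window index, built four 32-bit lanes at a time with paddd/pcmpeqd. A scalar stand-in, assuming one 64-bit word per 16-byte mask:

#include <stdint.h>

/* Build the per-call mask table: mask[k] = all-ones iff k == idx. */
static void build_masks(uint64_t mask[32], uint64_t idx)
{
    for (uint64_t k = 0; k < 32; k++) {
        uint64_t x = k ^ idx;                    /* 0 iff k == idx */
        mask[k] = ((x | (0 - x)) >> 63) - 1;     /* branch-free compare */
    }
}

Unlike the one-shot sketch after the commit message, the masks are computed once and parked on the stack because the gather itself runs once per outer-loop iteration.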
@@ -232,15 +250,14 @@ $code.=<<___;
 
 	mulq	$m1			# np[j]*m1
 	cmp	$num,$j
-	jne	.L1st
-
-	movq	%xmm8,$j
+	jne	.L1st			# note that upon exit $j==$num, so
+					# they can be used interchangeably
 
 	add	%rax,$hi1
 	adc	\$0,%rdx
 	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
 	adc	\$0,%rdx
-	mov	$hi1,-16(%rsp,$num,8)	# tp[j-1]
+	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
 	mov	%rdx,$hi1
 	mov	$lo0,$hi0
 
@@ -254,27 +271,32 @@ $code.=<<___;
 	jmp	.Louter
 .align	16
 .Louter:
-	lea	16($j),$m0
-	lea	32($j),$m1
-	and	\$63,$m0
-	lea	48($j),%rdx
-	and	\$63,$m1
-	and	\$63,%rdx
-	movdqa	`0*$STRIDE/4-128`($bp,$j),%xmm0
-	movdqa	`1*$STRIDE/4-128`($bp,$m0),%xmm1
-	movdqa	`2*$STRIDE/4-128`($bp,$m1),%xmm2
-	movdqa	`3*$STRIDE/4-128`($bp,%rdx),%xmm3
-	pand	%xmm4,%xmm0
-	pand	%xmm5,%xmm1
-	pand	%xmm6,%xmm2
-	por	%xmm1,%xmm0
-	pand	%xmm7,%xmm3
-	por	%xmm2,%xmm0
+	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
+	and	\$-16,%rdx
+	pxor	%xmm4,%xmm4
+	pxor	%xmm5,%xmm5
+___
+for($k=0;$k<$STRIDE/16;$k+=4) {
+$code.=<<___;
+	movdqa	`16*($k+0)-128`($bp),%xmm0
+	movdqa	`16*($k+1)-128`($bp),%xmm1
+	movdqa	`16*($k+2)-128`($bp),%xmm2
+	movdqa	`16*($k+3)-128`($bp),%xmm3
+	pand	`16*($k+0)-128`(%rdx),%xmm0
+	pand	`16*($k+1)-128`(%rdx),%xmm1
+	por	%xmm0,%xmm4
+	pand	`16*($k+2)-128`(%rdx),%xmm2
+	por	%xmm1,%xmm5
+	pand	`16*($k+3)-128`(%rdx),%xmm3
+	por	%xmm2,%xmm4
+	por	%xmm3,%xmm5
+___
+}
+$code.=<<___;
+	por	%xmm5,%xmm4
+	pshufd	\$0x4e,%xmm4,%xmm0
+	por	%xmm4,%xmm0
 	lea	$STRIDE($bp),$bp
-	por	%xmm3,%xmm0
-
-	pshufd	\$0x4e,%xmm0,%xmm1
-	por	%xmm1,%xmm0		# merge upper and lower halves
 
 	mov	($ap),%rax		# ap[0]
 	movq	%xmm0,$m0		# m0=bp[i]
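
With the masks parked on the stack, the reworked .Louter gather above scans all $STRIDE/16 slots of the bp table each iteration and ORs in the single masked candidate, so every b[i] fetch touches the whole table. The same scalar simplification as before:

#include <stdint.h>

/* Unified gather: exactly one mask[k] is all-ones, so acc ends up
 * holding bp[idx] while every entry is read unconditionally. */
static uint64_t gather(const uint64_t bp[32], const uint64_t mask[32])
{
    uint64_t acc = 0;
    for (int k = 0; k < 32; k++)
        acc |= bp[k] & mask[k];
    return acc;
}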
@@ -324,16 +346,14 @@ $code.=<<___;
 
 	mulq	$m1			# np[j]*m1
 	cmp	$num,$j
-	jne	.Linner
-
-	movq	%xmm8,$j
+	jne	.Linner			# note that upon exit $j==$num, so
+					# they can be used interchangeably
 
 	add	%rax,$hi1
 	adc	\$0,%rdx
 	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
 	mov	(%rsp,$num,8),$lo0
 	adc	\$0,%rdx
-	mov	$hi1,-16(%rsp,$num,8)	# tp[j-1]
+	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
 	mov	%rdx,$hi1
 
 	xor	%rdx,%rdx
@@ -380,13 +400,7 @@ $code.=<<___;
 
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
 	mov	\$1,%rax
-___
-$code.=<<___ if ($win64);
-	movaps	-104(%rsi),%xmm6
-	movaps	-88(%rsi),%xmm7
-	movaps	-72(%rsi),%xmm8
-___
-$code.=<<___;
+
 	mov	-48(%rsi),%r15
 	mov	-40(%rsi),%r14
 	mov	-32(%rsi),%r13
@@ -1065,10 +1079,15 @@ $code.=<<___;
 	movq	$bptr,%xmm4
 
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 	call	__bn_sqr8x_internal
+	call	__bn_post4x_internal
 
 	movq	%xmm2,$nptr
 	movq	%xmm4,$bptr
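
bn_power5 (and bn_powerx5 further down) raises its input to the 32nd power with five back-to-back Montgomery squarings; after this change each __bn_sqr8x_internal/__bn_sqrx8x_internal call is paired with an explicit post-condition call, since the final subtract-and-select was factored out of the reduction. The net arithmetic per chain is a^(2^5) mod n, as in this single-word analogue (uses the GCC/Clang __int128 extension):

#include <stdint.h>

/* Five back-to-back modular squarings: a^(2^5) = a^32 (mod n). */
static uint64_t pow32_mod(uint64_t a, uint64_t n)
{
    unsigned __int128 t = a % n;
    for (int i = 0; i < 5; i++)
        t = (t * t) % n;        /* one squaring + reduction per round */
    return (uint64_t)t;
}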
@@ -1629,7 +1648,7 @@ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
 
 $code.=<<___;
 	movq	%xmm2,$nptr
-sqr8x_reduction:
+__bn_sqr8x_reduction:
 	xor	%rax,%rax
 	lea	($nptr,$num),%rcx	# end of n[]
 	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
@@ -1888,6 +1907,8 @@ sqr8x_reduction:
 
 	cmp	%rdx,$tptr		# end of t[]?
 	jb	.L8x_reduction_loop
+	ret
+.size	bn_sqr8x_internal,.-bn_sqr8x_internal
 ___
 }
 ##############################################################
@@ -1896,13 +1917,12 @@ ___
 {
 my ($tptr,$nptr)=("%rbx","%rbp");
 $code.=<<___;
-	#xor	%rsi,%rsi		# %rsi was $carry above
+.type	__bn_post4x_internal,\@abi-omnipotent
+.align	32
+__bn_post4x_internal:
 	mov	8*0($nptr),%r12
-	sub	%r15,%rcx		# compare top-most words
 	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
-	adc	%rsi,%rsi
 	mov	$num,%rcx
-	or	%rsi,%rax
 	movq	%xmm1,$rptr		# restore $rptr
 	neg	%rax
 	movq	%xmm1,$aptr		# prepare for back-to-back call
@@ -1946,14 +1966,13 @@ $code.=<<___;
 
 	inc	%rcx			# pass %cf
 	jnz	.Lsqr4x_sub
-___
-}
-$code.=<<___;
+
 	mov	$num,%r10		# prepare for back-to-back call
 	neg	$num			# restore $num
 	ret
-.size	bn_sqr8x_internal,.-bn_sqr8x_internal
+.size	__bn_post4x_internal,.-__bn_post4x_internal
 ___
+}
 {
 $code.=<<___;
 .globl	bn_from_montgomery
@@ -2061,7 +2080,8 @@ $code.=<<___ if ($addx);
 	jne	.Lfrom_mont_nox
 
 	lea	(%rax,$num),$rptr
-	call	sqrx8x_reduction
+	call	__bn_sqrx8x_reduction
+	call	__bn_postx4x_internal
 
 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
@@ -2072,7 +2092,8 @@ $code.=<<___ if ($addx);
 .Lfrom_mont_nox:
 ___
 $code.=<<___;
-	call	sqr8x_reduction
+	call	__bn_sqr8x_reduction
+	call	__bn_post4x_internal
 
 	pxor	%xmm0,%xmm0
 	lea	48(%rsp),%rax
@@ -2622,10 +2643,15 @@ bn_powerx5:
 .Lpowerx5_body:
 
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 	call	__bn_sqrx8x_internal
+	call	__bn_postx4x_internal
 
 	mov	%r10,$num		# -num
 	mov	$aptr,$rptr
@@ -3071,7 +3097,7 @@ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
 
 $code.=<<___;
 	movq	%xmm2,$nptr
-sqrx8x_reduction:
+__bn_sqrx8x_reduction:
 	xor	%eax,%eax		# initial top-most carry bit
 	mov	32+8(%rsp),%rbx		# n0
 	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
@@ -3279,6 +3305,8 @@ sqrx8x_reduction:
 	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
 	cmp	8+8(%rsp),%r8		# end of t[]?
 	jb	.Lsqrx8x_reduction_loop
+	ret
+.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
 ___
 }
 ##############################################################
@@ -3286,15 +3314,11 @@ ___
 #
 {
 my ($rptr,$nptr)=("%rdx","%rbp");
-my @ri=map("%r$_",(10..13));
-my @ni=map("%r$_",(14..15));
 $code.=<<___;
+.align	32
+__bn_postx4x_internal:
 	mov	8*0($nptr),%r12
-	xor	%ebx,%ebx
-	sub	%r15,%rsi		# compare top-most words
-	adc	%rbx,%rbx
 	mov	%rcx,%r10		# -$num
-	or	%rbx,%rax
 	mov	%rcx,%r9		# -$num
 	neg	%rax
 	sar	\$3+2,%rcx
@@ -3308,6 +3332,7 @@ $code.=<<___;
 	mov	8*3($nptr),%r15
 	jmp	.Lsqrx4x_sub_entry
 
+.align	16
 .Lsqrx4x_sub:
 	mov	8*0($nptr),%r12
 	mov	8*1($nptr),%r13
@@ -3335,14 +3360,13 @@ $code.=<<___;
 
 	inc	%rcx
 	jnz	.Lsqrx4x_sub
-___
-}
-$code.=<<___;
+
 	neg	%r9			# restore $num
 
 	ret
-.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
+.size	__bn_postx4x_internal,.-__bn_postx4x_internal
 ___
+}
 }}}
 {
 my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") :	# Win64 order
@@ -3483,9 +3507,6 @@ ___
 }
 $code.=<<___;
 .align	64
-.Lmagic_masks:
-	.long	0x00,0x00,0x01,0x01, 0x08,0x08,0x09,0x09
-	.long	0x10,0x10,0x11,0x11, 0x18,0x18,0x19,0x19
 .Linc:
 	.long	0,0, 1,1
 	.long	2,2, 2,2
@@ -3541,13 +3562,6 @@ mul_handler:
 	mov	192($context),%r10	# pull $num
 	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
 
-	movaps	-104(%rax),%xmm0
-	movaps	-88(%rax),%xmm1
-	movaps	-72(%rax),%xmm2
-
-	movups	%xmm0,512($context)	# restore context->Xmm6
-	movups	%xmm1,528($context)	# restore context->Xmm7
-	movups	%xmm2,544($context)	# restore context->Xmm8
 	jmp	.Lbody_proceed
 
 .Lbody_40:
@@ -3675,8 +3689,9 @@ ___
 $code.=<<___;
 .align	8
 .LSEH_info_bn_gather5:
-	.byte	0x01,0x0b,0x02,0x00
-	.byte	0x0b,0x01,0x21,0x00	#sub	rsp,0x108
+	.byte	0x01,0x0b,0x03,0x0a
+	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
+	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
 .align	8
 ___
 }