bn/asm/x86[_64]-mont*.pl: complement alloca with page-walking.

Some OSes, *cough*-dows, insist on the stack being "wired" to
physical memory in a strictly sequential manner, i.e. if a stack
allocation spans two pages, then a reference to the farther one
can be punished with SEGV. But page walking can do good even on
other OSes, because it guarantees that a villain thread hits the
guard page before it can do damage to an innocent one...

Reviewed-by: Rich Salz <rsalz@openssl.org>
Author: Andy Polyakov
Date:   2016-03-04 11:39:11 +01:00
commit adc4f1fc25
parent 56cd71b46e
3 changed files with 116 additions and 2 deletions
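
All eleven page-walk loops inserted below follow the same pattern: take the distance between the old and the new stack pointer, round it down to a page boundary, then touch one word in every page of the fresh allocation, highest page first. A minimal C sketch of that logic (not part of the commit; PAGE_SIZE, old_sp and new_sp are illustrative names, and a 4096-byte page size is assumed, as in the patch):

#include <stddef.h>

#define PAGE_SIZE 4096          /* assumed page size, as in the patch */

/*
 * Touch one byte in every page between new_sp (the stack pointer after
 * the large alloca) and old_sp (the stack pointer before it), starting
 * with the highest page and walking down -- roughly what the
 * "sub %rsp,%r11; and $-4096,%r11; ...; jnc" loops below do.
 */
static void page_walk(unsigned char *new_sp, unsigned char *old_sp)
{
    /* highest page-aligned offset inside the new allocation */
    size_t off = (size_t)(old_sp - new_sp) & ~(size_t)(PAGE_SIZE - 1);

    for (;;) {
        (void)*(volatile unsigned char *)(new_sp + off); /* probe this page */
        if (off < PAGE_SIZE)    /* offset 0 probed: lowest page reached */
            break;
        off -= PAGE_SIZE;       /* step down one page ("sub $4096")     */
    }
}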

bn/asm/x86-mont.pl

@@ -85,6 +85,21 @@ $frame=32; # size of above frame rounded up to 16n
 	&and ("esp",-64); # align to cache line
+	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# physical memory in strictly sequential manner, i.e. if stack
+	# allocation spans two pages, then reference to farmost one can
+	# be punishable by SEGV. But page walking can do good even on
+	# other OSes, because it guarantees that villain thread hits
+	# the guard page before it can make damage to innocent one...
+	&mov ("eax","ebp");
+	&sub ("eax","esp");
+	&and ("eax",-4096);
+&set_label("page_walk");
+	&mov ("edx",&DWP(0,"esp","eax"));
+	&sub ("eax",4096);
+	&data_byte(0x2e);
+	&jnc (&label("page_walk"));
 	################################# load argument block...
 	&mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
 	&mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap

bn/asm/x86_64-mont.pl

@@ -130,6 +130,20 @@ $code.=<<___;
 	mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
 .Lmul_body:
+	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# physical memory in strictly sequential manner, i.e. if stack
+	# allocation spans two pages, then reference to farmost one can
+	# be punishable by SEGV. But page walking can do good even on
+	# other OSes, because it guarantees that villain thread hits
+	# the guard page before it can make damage to innocent one...
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lmul_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x66,0x2e # predict non-taken
+	jnc .Lmul_page_walk
 	mov $bp,%r12 # reassign $bp
 ___
 $bp="%r12";
@@ -342,6 +356,14 @@ $code.=<<___;
 	mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
 .Lmul4x_body:
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lmul4x_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x2e # predict non-taken
+	jnc .Lmul4x_page_walk
 	mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
 	mov %rdx,%r12 # reassign $bp
 ___
@@ -795,6 +817,15 @@ bn_sqr8x_mont:
 	sub %r11,%rsp
 .Lsqr8x_sp_done:
 	and \$-64,%rsp
+	mov %rax,%r11
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lsqr8x_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x2e # predict non-taken
+	jnc .Lsqr8x_page_walk
 	mov $num,%r10
 	neg $num
@@ -932,8 +963,17 @@ bn_mulx4x_mont:
 	sub $num,%r10 # -$num
 	mov ($n0),$n0 # *n0
 	lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
-	lea ($bp,$num),%r10
 	and \$-128,%rsp
+	mov %rax,%r11
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lmulx4x_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x66,0x2e # predict non-taken
+	jnc .Lmulx4x_page_walk
+	lea ($bp,$num),%r10
 	##############################################################
 	# Stack layout
 	# +0 num

bn/asm/x86_64-mont5.pl

@@ -115,6 +115,20 @@ $code.=<<___;
 	mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
 .Lmul_body:
+	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# physical memory in strictly sequential manner, i.e. if stack
+	# allocation spans two pages, then reference to farmost one can
+	# be punishable by SEGV. But page walking can do good even on
+	# other OSes, because it guarantees that villain thread hits
+	# the guard page before it can make damage to innocent one...
+	sub %rsp,%rax
+	and \$-4096,%rax
+.Lmul_page_walk:
+	mov (%rsp,%rax),%r11
+	sub \$4096,%rax
+	.byte 0x2e # predict non-taken
+	jnc .Lmul_page_walk
 	lea 128($bp),%r12 # reassign $bp (+size optimization)
 ___
 $bp="%r12";
@@ -469,6 +483,15 @@ $code.=<<___;
 	sub %r11,%rsp
 .Lmul4xsp_done:
 	and \$-64,%rsp
+	mov %rax,%r11
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lmul4x_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x2e # predict non-taken
+	jnc .Lmul4x_page_walk
 	neg $num
 	mov %rax,40(%rsp)
@@ -1058,6 +1081,15 @@ $code.=<<___;
 	sub %r11,%rsp
 .Lpwr_sp_done:
 	and \$-64,%rsp
+	mov %rax,%r11
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lpwr_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x2e # predict non-taken
+	jnc .Lpwr_page_walk
 	mov $num,%r10
 	neg $num
@@ -2028,7 +2060,16 @@ bn_from_mont8x:
 	sub %r11,%rsp
 .Lfrom_sp_done:
 	and \$-64,%rsp
-	mov $num,%r10
+	mov %rax,%r11
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lfrom_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x2e # predict non-taken
+	jnc .Lfrom_page_walk
+	mov $num,%r10
 	neg $num
 	##############################################################
@@ -2173,6 +2214,15 @@ bn_mulx4x_mont_gather5:
 	sub %r11,%rsp
 .Lmulx4xsp_done:
 	and \$-64,%rsp # ensure alignment
+	mov %rax,%r11
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lmulx4x_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x2e # predict non-taken
+	jnc .Lmulx4x_page_walk
 	##############################################################
 	# Stack layout
 	# +0 -num
@@ -2619,6 +2669,15 @@ bn_powerx5:
 	sub %r11,%rsp
 .Lpwrx_sp_done:
 	and \$-64,%rsp
+	mov %rax,%r11
+	sub %rsp,%r11
+	and \$-4096,%r11
+.Lpwrx_page_walk:
+	mov (%rsp,%r11),%r10
+	sub \$4096,%r11
+	.byte 0x2e # predict non-taken
+	jnc .Lpwrx_page_walk
 	mov $num,%r10
 	neg $num