diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index e8f6b0508..89f4de61e 100755
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -85,6 +85,21 @@ $frame=32;				# size of above frame rounded up to 16n
 
 	&and	("esp",-64);			# align to cache line
 
+	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# physical memory in strictly sequential manner, i.e. if stack
+	# allocation spans two pages, then reference to farmost one can
+	# be punishable by SEGV. But page walking can do good even on
+	# other OSes, because it guarantees that villain thread hits
+	# the guard page before it can make damage to innocent one...
+	&mov	("eax","ebp");
+	&sub	("eax","esp");
+	&and	("eax",-4096);
+&set_label("page_walk");
+	&mov	("edx",&DWP(0,"esp","eax"));
+	&sub	("eax",4096);
+	&data_byte(0x2e);
+	&jnc	(&label("page_walk"));
+
 	################################# load argument block...
 	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
 	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 17fb94c84..c8ae019da 100755
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -91,6 +91,20 @@ bn_mul_mont:
 
 	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
 .Lmul_body:
+	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# physical memory in strictly sequential manner, i.e. if stack
+	# allocation spans two pages, then reference to farmost one can
+	# be punishable by SEGV. But page walking can do good even on
+	# other OSes, because it guarantees that villain thread hits
+	# the guard page before it can make damage to innocent one...
+	sub	%rsp,%r11
+	and	\$-4096,%r11
+.Lmul_page_walk:
+	mov	(%rsp,%r11),%r10
+	sub	\$4096,%r11
+	.byte	0x66,0x2e		# predict non-taken
+	jnc	.Lmul_page_walk
+
 	mov	$bp,%r12		# reassign $bp
 ___
 	$bp="%r12";
@@ -296,6 +310,14 @@ bn_mul4x_mont:
 
 	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
 .Lmul4x_body:
+	sub	%rsp,%r11
+	and	\$-4096,%r11
+.Lmul4x_page_walk:
+	mov	(%rsp,%r11),%r10
+	sub	\$4096,%r11
+	.byte	0x2e			# predict non-taken
+	jnc	.Lmul4x_page_walk
+
 	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
 	mov	%rdx,%r12		# reassign $bp
 ___
@@ -707,6 +729,7 @@ $code.=<<___;
 .align	16
 bn_sqr4x_mont:
 .Lsqr4x_enter:
+	mov	%rsp,%rax
 	push	%rbx
 	push	%rbp
 	push	%r12
@@ -715,12 +738,23 @@ bn_sqr4x_mont:
 	push	%r15
 
 	shl	\$3,${num}d		# convert $num to bytes
-	xor	%r10,%r10
 	mov	%rsp,%r11		# put aside %rsp
-	sub	$num,%r10		# -$num
+	neg	$num			# -$num
 	mov	($n0),$n0		# *n0
-	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
+	lea	-72(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
 	and	\$-1024,%rsp		# minimize TLB usage
+
+	sub	%rsp,%r11
+	and	\$-4096,%r11
+.Lsqr4x_page_walk:
+	mov	(%rsp,%r11),%r10
+	sub	\$4096,%r11
+	.byte	0x2e			# predict non-taken
+	jnc	.Lsqr4x_page_walk
+
+	mov	$num,%r10
+	neg	$num			# restore $num
+	lea	-48(%rax),%r11		# restore saved %rsp
 	##############################################################
 	# Stack layout
 	#
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index 235979181..99243ae3e 100755
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -84,6 +84,20 @@ bn_mul_mont_gather5:
 
 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
 .Lmul_body:
+	# Some OSes, *cough*-dows, insist on stack being "wired" to
+	# physical memory in strictly sequential manner, i.e. if stack
+	# allocation spans two pages, then reference to farmost one can
+	# be punishable by SEGV. But page walking can do good even on
+	# other OSes, because it guarantees that villain thread hits
+	# the guard page before it can make damage to innocent one...
+	sub	%rsp,%rax
+	and	\$-4096,%rax
+.Lmul_page_walk:
+	mov	(%rsp,%rax),%r11
+	sub	\$4096,%rax
+	.byte	0x2e			# predict non-taken
+	jnc	.Lmul_page_walk
+
 	lea	128($bp),%r12		# reassign $bp (+size optimization)
 ___
 	$bp="%r12";
@@ -407,6 +421,14 @@ bn_mul4x_mont_gather5:
 
 	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
 .Lmul4x_body:
+	sub	%rsp,%rax
+	and	\$-4096,%rax
+.Lmul4x_page_walk:
+	mov	(%rsp,%rax),%r11
+	sub	\$4096,%rax
+	.byte	0x2e			# predict non-taken
+	jnc	.Lmul4x_page_walk
+
 	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
 	lea	128(%rdx),%r12		# reassign $bp (+size optimization)
 ___
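
All of the added loops implement the same pattern: after the alloca-style stack reservation, compute the distance between the old and the new stack pointer, round it down to a 4096-byte page boundary, and read one word from every page of the reservation, starting next to the already committed stack and walking toward the far end, so an OS guard page is reached in order instead of being skipped over. The C fragment below is only an illustrative sketch of that idea, not part of the patch; PAGE_SIZE and page_walk are made-up names, and the real probe is of course done in assembly as above.

	#include <stddef.h>

	#define PAGE_SIZE 4096

	/* Touch one byte in every page of a freshly reserved region,
	 * starting at the end adjacent to already committed memory
	 * (highest offset) and walking down to the far end (offset 0),
	 * mirroring the "mov (%rsp,%r11),%r10 / sub $4096,%r11 / jnc"
	 * loop added by the patch. */
	static void page_walk(const unsigned char *base, size_t len)
	{
	    if (len == 0)
	        return;
	    size_t off = (len - 1) & ~(size_t)(PAGE_SIZE - 1);
	    for (;;) {
	        volatile unsigned char probe = base[off];  /* probing read */
	        (void)probe;
	        if (off == 0)
	            break;
	        off -= PAGE_SIZE;
	    }
	}

In the patch itself the probe distance is (saved stack pointer - new stack pointer) & -4096, and the extra prefix bytes emitted in front of each jnc encode a "predict non-taken" branch hint, as the inline comments note.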