bn/asm/s390x.S: improve performance on z196 and z13 by up to 26% [even z10 is a couple of percent faster]. Triggered by RT#4128, but solves the problem by real modulo-scheduling.

Reviewed-by: Rich Salz <rsalz@openssl.org>
This commit is contained in:
Andy Polyakov 2015-11-10 21:11:24 +01:00
parent a5fd24d19b
commit 9d0e4dc635

View File

@ -18,71 +18,106 @@
// bn_mul_add_words: per the inline comments below, adds ap[i]*w into rp[i]
// for len 64-bit words, propagating a carry, and returns the final carry
// word in %r2.
// Register use as named by the existing comments: %r2 = rp on entry (copied
// to %r1), %r3 = ap, %r4 = len, %r5 = w; %r14 is the return address.
// "zero" is a register alias defined outside this excerpt -- presumably a
// scratch register that is kept at 0; TODO confirm.
//
// NOTE(review): this text appears to be a diff rendered without +/- markers,
// so lines from two versions of the routine are interleaved (two
// ".Loop4_madd:" labels, two stmg, two epilogues under .Lend_madd, duplicated
// la/lghi). It is not meant to be assembled as shown. Rule of thumb for
// telling the versions apart:
//   - lines that index memory with %r2 (0(%r2,%r3), 8(%r2,%r1)), use %r10 as
//     the len%4 counter, %r8 as carry and save %r6-%r10 belong to one version;
//   - lines that address 0(%r3) / 0(%r3,%r1), use %r2 as the len%4 counter,
//     %r12 as carry and save %r6-%r13 belong to the other.
// Lines common to both appear once. TODO confirm against the real file.
.align 4
bn_mul_add_words:
lghi zero,0 // zero = 0
// NOTE(review): the next four lines look like two variants of the same two
// instructions, differing only in their comments.
la %r1,0(%r2) // put rp aside
lghi %r2,0 // i=0;
la %r1,0(%r2) // put rp aside [to give way to]
lghi %r2,0 // return value
// ltgfr sign-extends the 32-bit len to 64 bits and sets the condition code
// that the bler below tests.
ltgfr %r4,%r4
bler %r14 // if (len<=0) return 0;
// Variant A setup: save %r6-%r10 (restored by "lmg %r6,%r10" at .Lend_madd),
// len%4 in %r10, carry in %r8.
stmg %r6,%r10,48(%r15)
lghi %r10,3
lghi %r8,0 // carry = 0
nr %r10,%r4 // len%4
// Variant B setup: save %r6-%r13 (restored by "lmg %r6,%r13" at .Lend_madd),
// len%4 in %r2, carry in %r12. %r1 becomes rp-ap so that rp[i] can be
// addressed as 0(%r3,%r1) while only %r3 is advanced.
stmg %r6,%r13,48(%r15)
lghi %r2,3
lghi %r12,0 // carry = 0
slgr %r1,%r3 // rp-=ap
nr %r2,%r4 // len%4
// sra sets the condition code, so the jz is taken when len/4 == 0.
sra %r4,2 // cnt=len/4
jz .Loop1_madd // carry is incidentally cleared if branch taken
algr zero,zero // clear carry
.Loop4_madd:
lg %r7,0(%r2,%r3) // ap[i]
// Variant B prologue: preload ap[0] and ap[1] and start the first multiply
// before entering the loop.
lg %r7,0(%r3) // ap[0]
lg %r9,8(%r3) // ap[1]
// mlgr %rN,%r5 multiplies the odd register %rN+1 by %r5 and leaves the 128-bit
// product in the pair %rN (high half) : %rN+1 (low half).
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
// Fold the carry-out of the previous add into the high half of the product.
alcgr %r6,zero
alg %r7,0(%r2,%r1) // +=rp[i]
stg %r7,0(%r2,%r1) // rp[i]=
// Variant B: brct decrements cnt; if it was 1 (now 0) fall through and jump to
// the tail, which handles the last group of four without preloading further.
brct %r4,.Loop4_madd
j .Loop4_madd_tail
lg %r9,8(%r2,%r3)
.Loop4_madd:
// In the %r3-addressed lines of this loop the multiply for the next word is
// issued before the adds that finish the current word, and ap is loaded two
// words ahead. The carry produced by each alg is consumed by the following
// alcgr, so the carry chain runs through the whole loop body.
mlgr %r8,%r5
lg %r11,16(%r3) // ap[i+2]
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r2,%r1)
stg %r9,8(%r2,%r1)
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
// Preload of the first word of the next group of four (offset 32).
lg %r7,32(%r3)
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
lg %r7,16(%r2,%r3)
mlgr %r6,%r5
alcgr %r7,%r8
alcgr %r6,zero
alg %r7,16(%r2,%r1)
stg %r7,16(%r2,%r1)
// Preload of the second word of the next group of four (offset 40).
lg %r9,40(%r3)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
lg %r9,24(%r2,%r3)
mlgr %r8,%r5
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,24(%r2,%r1)
stg %r9,24(%r2,%r1)
// Two variants of the pointer/index advance; la and brct are used because,
// as the comments further down note, they leave the condition code (and thus
// the pending carry) untouched.
la %r2,32(%r2) // i+=4
la %r3,32(%r3) // i+=4
brct %r4,.Loop4_madd
la %r10,1(%r10) // see if len%4 is zero ...
brct %r10,.Loop1_madd // without touching condition code:-)
.Loop4_madd_tail:
// Same sequence as the %r3-addressed loop body, but without the loads at
// offsets 32/40 and without the fourth mlgr, so it finishes the current
// group of four only.
mlgr %r8,%r5
lg %r11,16(%r3)
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
mlgr %r10,%r5
lg %r13,24(%r3)
alcgr %r9,%r6
alcgr %r8,zero
alg %r9,8(%r3,%r1)
stg %r9,8(%r3,%r1)
mlgr %r12,%r5
alcgr %r11,%r8
alcgr %r10,zero
alg %r11,16(%r3,%r1)
stg %r11,16(%r3,%r1)
alcgr %r13,%r10
alcgr %r12,zero
alg %r13,24(%r3,%r1)
stg %r13,24(%r3,%r1)
la %r3,32(%r3) // i+=4
// %r2 holds len%4 here: adding 1 and letting brct subtract 1 branches exactly
// when len%4 != 0.
la %r2,1(%r2) // see if len%4 is zero ...
brct %r2,.Loop1_madd // without touching condition code:-)
.Lend_madd:
// Two epilogue variants follow: the first returns the carry kept in %r8 and
// restores %r6-%r10; the second returns the carry kept in %r12 and restores
// %r6-%r13. Each adds in the still-pending carry bit before returning.
alcgr %r8,zero // collect carry bit
lgr %r2,%r8
lmg %r6,%r10,48(%r15)
lgr %r2,zero // return value
alcgr %r2,%r12 // collect even carry bit
lmg %r6,%r13,48(%r15)
br %r14
.Loop1_madd:
// One word per iteration, for the len%4 leftover words (or all words when
// len/4 == 0). Both addressing variants are interleaved here as well.
lg %r7,0(%r2,%r3) // ap[i]
lg %r7,0(%r3) // ap[i]
mlgr %r6,%r5 // *=w
alcgr %r7,%r8 // +=carry
alcgr %r7,%r12 // +=carry
alcgr %r6,zero
alg %r7,0(%r2,%r1) // +=rp[i]
stg %r7,0(%r2,%r1) // rp[i]=
alg %r7,0(%r3,%r1) // +=rp[i]
stg %r7,0(%r3,%r1) // rp[i]=
// The high half of the product becomes the carry word for the next iteration.
lgr %r8,%r6
la %r2,8(%r2) // i++
brct %r10,.Loop1_madd
lgr %r12,%r6
la %r3,8(%r3) // i++
brct %r2,.Loop1_madd
j .Lend_madd
.size bn_mul_add_words,.-bn_mul_add_words