+20% tune-up for Power5.
This commit is contained in:
parent
6c69aa532e
commit
8ea975d070
@ -162,17 +162,16 @@ $code=<<___;
|
|||||||
.align 4
|
.align 4
|
||||||
L1st:
|
L1st:
|
||||||
$LDX $aj,$ap,$j ; ap[j]
|
$LDX $aj,$ap,$j ; ap[j]
|
||||||
$LDX $nj,$np,$j ; np[j]
|
|
||||||
addc $lo0,$alo,$hi0
|
addc $lo0,$alo,$hi0
|
||||||
|
$LDX $nj,$np,$j ; np[j]
|
||||||
addze $hi0,$ahi
|
addze $hi0,$ahi
|
||||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
|
$UMULL $alo,$aj,$m0 ; ap[j]*bp[0]
|
||||||
$UMULH $ahi,$aj,$m0
|
|
||||||
|
|
||||||
addc $lo1,$nlo,$hi1
|
addc $lo1,$nlo,$hi1
|
||||||
|
$UMULH $ahi,$aj,$m0
|
||||||
addze $hi1,$nhi
|
addze $hi1,$nhi
|
||||||
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
||||||
$UMULH $nhi,$nj,$m1
|
|
||||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
|
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[0]
|
||||||
|
$UMULH $nhi,$nj,$m1
|
||||||
addze $hi1,$hi1
|
addze $hi1,$hi1
|
||||||
$ST $lo1,0($tp) ; tp[j-1]
|
$ST $lo1,0($tp) ; tp[j-1]
|
||||||
|
|
||||||
@ -206,20 +205,16 @@ Louter:
|
|||||||
$LD $aj,$BNSZ($ap) ; ap[1]
|
$LD $aj,$BNSZ($ap) ; ap[1]
|
||||||
$LD $nj,0($np) ; np[0]
|
$LD $nj,0($np) ; np[0]
|
||||||
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
|
addc $lo0,$lo0,$tj ; ap[0]*bp[i]+tp[0]
|
||||||
addze $hi0,$hi0
|
|
||||||
|
|
||||||
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
|
|
||||||
|
|
||||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
||||||
|
addze $hi0,$hi0
|
||||||
|
$UMULL $m1,$lo0,$n0 ; tp[0]*n0
|
||||||
$UMULH $ahi,$aj,$m0
|
$UMULH $ahi,$aj,$m0
|
||||||
|
|
||||||
$UMULL $lo1,$nj,$m1 ; np[0]*m1
|
$UMULL $lo1,$nj,$m1 ; np[0]*m1
|
||||||
$UMULH $hi1,$nj,$m1
|
$UMULH $hi1,$nj,$m1
|
||||||
$LD $nj,$BNSZ($np) ; np[1]
|
$LD $nj,$BNSZ($np) ; np[1]
|
||||||
addc $lo1,$lo1,$lo0
|
addc $lo1,$lo1,$lo0
|
||||||
addze $hi1,$hi1
|
|
||||||
|
|
||||||
$UMULL $nlo,$nj,$m1 ; np[1]*m1
|
$UMULL $nlo,$nj,$m1 ; np[1]*m1
|
||||||
|
addze $hi1,$hi1
|
||||||
$UMULH $nhi,$nj,$m1
|
$UMULH $nhi,$nj,$m1
|
||||||
|
|
||||||
mtctr $num
|
mtctr $num
|
||||||
@ -227,24 +222,22 @@ Louter:
|
|||||||
.align 4
|
.align 4
|
||||||
Linner:
|
Linner:
|
||||||
$LDX $aj,$ap,$j ; ap[j]
|
$LDX $aj,$ap,$j ; ap[j]
|
||||||
$LD $tj,$BNSZ($tp) ; tp[j]
|
|
||||||
addc $lo0,$alo,$hi0
|
addc $lo0,$alo,$hi0
|
||||||
|
$LD $tj,$BNSZ($tp) ; tp[j]
|
||||||
addze $hi0,$ahi
|
addze $hi0,$ahi
|
||||||
$LDX $nj,$np,$j ; np[j]
|
$LDX $nj,$np,$j ; np[j]
|
||||||
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
|
|
||||||
addze $hi0,$hi0
|
|
||||||
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
|
||||||
$UMULH $ahi,$aj,$m0
|
|
||||||
|
|
||||||
addc $lo1,$nlo,$hi1
|
addc $lo1,$nlo,$hi1
|
||||||
|
$UMULL $alo,$aj,$m0 ; ap[j]*bp[i]
|
||||||
addze $hi1,$nhi
|
addze $hi1,$nhi
|
||||||
|
$UMULH $ahi,$aj,$m0
|
||||||
|
addc $lo0,$lo0,$tj ; ap[j]*bp[i]+tp[j]
|
||||||
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
$UMULL $nlo,$nj,$m1 ; np[j]*m1
|
||||||
|
addze $hi0,$hi0
|
||||||
$UMULH $nhi,$nj,$m1
|
$UMULH $nhi,$nj,$m1
|
||||||
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
|
addc $lo1,$lo1,$lo0 ; np[j]*m1+ap[j]*bp[i]+tp[j]
|
||||||
|
addi $j,$j,$BNSZ ; j++
|
||||||
addze $hi1,$hi1
|
addze $hi1,$hi1
|
||||||
$ST $lo1,0($tp) ; tp[j-1]
|
$ST $lo1,0($tp) ; tp[j-1]
|
||||||
|
|
||||||
addi $j,$j,$BNSZ ; j++
|
|
||||||
addi $tp,$tp,$BNSZ ; tp++
|
addi $tp,$tp,$BNSZ ; tp++
|
||||||
bdnz- Linner
|
bdnz- Linner
|
||||||
;Linner
|
;Linner
|
||||||
|
Loading…
x
Reference in New Issue
Block a user