This is an "informational" commit. Its sole purpose is to expose the "modulo
factor" in the inner loops.
This commit is contained in:
parent
48d2335d73
commit
1c3d2b94be
@ -286,19 +286,16 @@ $fname:
|
|||||||
!or %o7,%o0,%o0 ! 64-bit result
|
!or %o7,%o0,%o0 ! 64-bit result
|
||||||
srlx %o3,16,%g1 ! 34-bit carry
|
srlx %o3,16,%g1 ! 34-bit carry
|
||||||
|
|
||||||
ba .L1st
|
|
||||||
add $j,8,$j
|
add $j,8,$j
|
||||||
.align 32
|
add $ap,$j,%o4
|
||||||
.L1st:
|
add $np,$j,%o5
|
||||||
add $ap,$j,%o3
|
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||||
add $np,$j,%o4
|
|
||||||
ld [%o3+0],$alo_ ! load a[j] as pair of 32-bit words
|
|
||||||
fzeros $alo
|
fzeros $alo
|
||||||
ld [%o3+4],$ahi_
|
ld [%o4+4],$ahi_
|
||||||
fzeros $ahi
|
fzeros $ahi
|
||||||
ld [%o4+0],$nlo_ ! load n[j] as pair of 32-bit words
|
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||||
fzeros $nlo
|
fzeros $nlo
|
||||||
ld [%o4+4],$nhi_
|
ld [%o5+4],$nhi_
|
||||||
fzeros $nhi
|
fzeros $nhi
|
||||||
|
|
||||||
fxtod $alo,$alo
|
fxtod $alo,$alo
|
||||||
@ -350,6 +347,11 @@ $fname:
|
|||||||
std $nlob,[%sp+$bias+$frame+8]
|
std $nlob,[%sp+$bias+$frame+8]
|
||||||
std $nloc,[%sp+$bias+$frame+16]
|
std $nloc,[%sp+$bias+$frame+16]
|
||||||
std $nlod,[%sp+$bias+$frame+24]
|
std $nlod,[%sp+$bias+$frame+24]
|
||||||
|
|
||||||
|
addcc $j,8,$j
|
||||||
|
bz,pn %icc,.L1stskip
|
||||||
|
.align 32,0x1000000
|
||||||
|
.L1st:
|
||||||
ldx [%sp+$bias+$frame+0],%o0
|
ldx [%sp+$bias+$frame+0],%o0
|
||||||
ldx [%sp+$bias+$frame+8],%o1
|
ldx [%sp+$bias+$frame+8],%o1
|
||||||
ldx [%sp+$bias+$frame+16],%o2
|
ldx [%sp+$bias+$frame+16],%o2
|
||||||
@ -376,9 +378,101 @@ $fname:
|
|||||||
add %g1,1,%g1
|
add %g1,1,%g1
|
||||||
|
|
||||||
stx %o0,[$tp] ! tp[j-1]=
|
stx %o0,[$tp] ! tp[j-1]=
|
||||||
|
|
||||||
|
|
||||||
|
add $ap,$j,%o4
|
||||||
|
add $np,$j,%o5
|
||||||
|
ld [%o4+0],$alo_ ! load a[j] as pair of 32-bit words
|
||||||
|
fzeros $alo
|
||||||
|
ld [%o4+4],$ahi_
|
||||||
|
fzeros $ahi
|
||||||
|
ld [%o5+0],$nlo_ ! load n[j] as pair of 32-bit words
|
||||||
|
fzeros $nlo
|
||||||
|
ld [%o5+4],$nhi_
|
||||||
|
fzeros $nhi
|
||||||
|
|
||||||
|
fxtod $alo,$alo
|
||||||
|
fxtod $ahi,$ahi
|
||||||
|
fxtod $nlo,$nlo
|
||||||
|
fxtod $nhi,$nhi
|
||||||
|
|
||||||
|
std $alo,[$ap_l+$j] ! save smashed ap[j] in double format
|
||||||
|
fmuld $alo,$ba,$aloa
|
||||||
|
std $ahi,[$ap_h+$j]
|
||||||
|
fmuld $nlo,$na,$nloa
|
||||||
|
std $nlo,[$np_l+$j] ! save smashed np[j] in double format
|
||||||
|
fmuld $alo,$bb,$alob
|
||||||
|
std $nhi,[$np_h+$j]
|
||||||
|
fmuld $nlo,$nb,$nlob
|
||||||
|
fmuld $alo,$bc,$aloc
|
||||||
|
faddd $aloa,$nloa,$nloa
|
||||||
|
fmuld $nlo,$nc,$nloc
|
||||||
|
fmuld $alo,$bd,$alod
|
||||||
|
faddd $alob,$nlob,$nlob
|
||||||
|
fmuld $nlo,$nd,$nlod
|
||||||
|
fmuld $ahi,$ba,$ahia
|
||||||
|
faddd $aloc,$nloc,$nloc
|
||||||
|
fmuld $nhi,$na,$nhia
|
||||||
|
fmuld $ahi,$bb,$ahib
|
||||||
|
faddd $alod,$nlod,$nlod
|
||||||
|
fmuld $nhi,$nb,$nhib
|
||||||
|
fmuld $ahi,$bc,$ahic
|
||||||
|
faddd $ahia,$nhia,$nhia
|
||||||
|
fmuld $nhi,$nc,$nhic
|
||||||
|
fmuld $ahi,$bd,$ahid
|
||||||
|
faddd $ahib,$nhib,$nhib
|
||||||
|
fmuld $nhi,$nd,$nhid
|
||||||
|
|
||||||
|
faddd $dota,$nloa,$nloa
|
||||||
|
faddd $dotb,$nlob,$nlob
|
||||||
|
faddd $ahic,$nhic,$dota ! $nhic
|
||||||
|
faddd $ahid,$nhid,$dotb ! $nhid
|
||||||
|
|
||||||
|
faddd $nloc,$nhia,$nloc
|
||||||
|
faddd $nlod,$nhib,$nlod
|
||||||
|
|
||||||
|
fdtox $nloa,$nloa
|
||||||
|
fdtox $nlob,$nlob
|
||||||
|
fdtox $nloc,$nloc
|
||||||
|
fdtox $nlod,$nlod
|
||||||
|
|
||||||
|
std $nloa,[%sp+$bias+$frame+0]
|
||||||
|
std $nlob,[%sp+$bias+$frame+8]
|
||||||
|
std $nloc,[%sp+$bias+$frame+16]
|
||||||
|
std $nlod,[%sp+$bias+$frame+24]
|
||||||
|
|
||||||
addcc $j,8,$j
|
addcc $j,8,$j
|
||||||
bnz,pt %icc,.L1st
|
bnz,pt %icc,.L1st
|
||||||
add $tp,8,$tp
|
add $tp,8,$tp
|
||||||
|
|
||||||
|
.L1stskip:
|
||||||
|
ldx [%sp+$bias+$frame+0],%o0
|
||||||
|
ldx [%sp+$bias+$frame+8],%o1
|
||||||
|
ldx [%sp+$bias+$frame+16],%o2
|
||||||
|
ldx [%sp+$bias+$frame+24],%o3
|
||||||
|
|
||||||
|
srlx %o0,16,%o7
|
||||||
|
add %o7,%o1,%o1
|
||||||
|
srlx %o1,16,%o7
|
||||||
|
add %o7,%o2,%o2
|
||||||
|
srlx %o2,16,%o7
|
||||||
|
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||||
|
and %o0,$mask,%o0
|
||||||
|
and %o1,$mask,%o1
|
||||||
|
and %o2,$mask,%o2
|
||||||
|
sllx %o1,16,%o1
|
||||||
|
sllx %o2,32,%o2
|
||||||
|
sllx %o3,48,%o7
|
||||||
|
or %o1,%o0,%o0
|
||||||
|
or %o2,%o0,%o0
|
||||||
|
or %o7,%o0,%o0 ! 64-bit result
|
||||||
|
addcc %g1,%o0,%o0
|
||||||
|
srlx %o3,16,%g1 ! 34-bit carry
|
||||||
|
bcs,a %xcc,.+8
|
||||||
|
add %g1,1,%g1
|
||||||
|
|
||||||
|
stx %o0,[$tp] ! tp[j-1]=
|
||||||
|
add $tp,8,$tp
|
||||||
|
|
||||||
fdtox $dota,$dota
|
fdtox $dota,$dota
|
||||||
fdtox $dotb,$dotb
|
fdtox $dotb,$dotb
|
||||||
@ -514,10 +608,7 @@ $fname:
|
|||||||
bcs,a %xcc,.+8
|
bcs,a %xcc,.+8
|
||||||
add %g1,1,%g1
|
add %g1,1,%g1
|
||||||
|
|
||||||
ba .Linner
|
|
||||||
add $j,8,$j
|
add $j,8,$j
|
||||||
.align 32
|
|
||||||
.Linner:
|
|
||||||
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
||||||
ldd [$ap_h+$j],$ahi
|
ldd [$ap_h+$j],$ahi
|
||||||
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
||||||
@ -563,6 +654,11 @@ $fname:
|
|||||||
std $nlob,[%sp+$bias+$frame+8]
|
std $nlob,[%sp+$bias+$frame+8]
|
||||||
std $nloc,[%sp+$bias+$frame+16]
|
std $nloc,[%sp+$bias+$frame+16]
|
||||||
std $nlod,[%sp+$bias+$frame+24]
|
std $nlod,[%sp+$bias+$frame+24]
|
||||||
|
|
||||||
|
addcc $j,8,$j
|
||||||
|
bz,pn %icc,.Linnerskip
|
||||||
|
.align 32,0x1000000
|
||||||
|
.Linner:
|
||||||
ldx [%sp+$bias+$frame+0],%o0
|
ldx [%sp+$bias+$frame+0],%o0
|
||||||
ldx [%sp+$bias+$frame+8],%o1
|
ldx [%sp+$bias+$frame+8],%o1
|
||||||
ldx [%sp+$bias+$frame+16],%o2
|
ldx [%sp+$bias+$frame+16],%o2
|
||||||
@ -594,9 +690,91 @@ $fname:
|
|||||||
add %g1,1,%g1
|
add %g1,1,%g1
|
||||||
|
|
||||||
stx %o0,[$tp] ! tp[j-1]
|
stx %o0,[$tp] ! tp[j-1]
|
||||||
|
|
||||||
|
|
||||||
|
ldd [$ap_l+$j],$alo ! load a[j] in double format
|
||||||
|
ldd [$ap_h+$j],$ahi
|
||||||
|
ldd [$np_l+$j],$nlo ! load n[j] in double format
|
||||||
|
ldd [$np_h+$j],$nhi
|
||||||
|
|
||||||
|
fmuld $alo,$ba,$aloa
|
||||||
|
fmuld $nlo,$na,$nloa
|
||||||
|
fmuld $alo,$bb,$alob
|
||||||
|
fmuld $nlo,$nb,$nlob
|
||||||
|
fmuld $alo,$bc,$aloc
|
||||||
|
faddd $aloa,$nloa,$nloa
|
||||||
|
fmuld $nlo,$nc,$nloc
|
||||||
|
fmuld $alo,$bd,$alod
|
||||||
|
faddd $alob,$nlob,$nlob
|
||||||
|
fmuld $nlo,$nd,$nlod
|
||||||
|
fmuld $ahi,$ba,$ahia
|
||||||
|
faddd $aloc,$nloc,$nloc
|
||||||
|
fmuld $nhi,$na,$nhia
|
||||||
|
fmuld $ahi,$bb,$ahib
|
||||||
|
faddd $alod,$nlod,$nlod
|
||||||
|
fmuld $nhi,$nb,$nhib
|
||||||
|
fmuld $ahi,$bc,$ahic
|
||||||
|
faddd $ahia,$nhia,$nhia
|
||||||
|
fmuld $nhi,$nc,$nhic
|
||||||
|
fmuld $ahi,$bd,$ahid
|
||||||
|
faddd $ahib,$nhib,$nhib
|
||||||
|
fmuld $nhi,$nd,$nhid
|
||||||
|
|
||||||
|
faddd $dota,$nloa,$nloa
|
||||||
|
faddd $dotb,$nlob,$nlob
|
||||||
|
faddd $ahic,$nhic,$dota ! $nhic
|
||||||
|
faddd $ahid,$nhid,$dotb ! $nhid
|
||||||
|
|
||||||
|
faddd $nloc,$nhia,$nloc
|
||||||
|
faddd $nlod,$nhib,$nlod
|
||||||
|
|
||||||
|
fdtox $nloa,$nloa
|
||||||
|
fdtox $nlob,$nlob
|
||||||
|
fdtox $nloc,$nloc
|
||||||
|
fdtox $nlod,$nlod
|
||||||
|
|
||||||
|
std $nloa,[%sp+$bias+$frame+0]
|
||||||
|
std $nlob,[%sp+$bias+$frame+8]
|
||||||
|
std $nloc,[%sp+$bias+$frame+16]
|
||||||
|
std $nlod,[%sp+$bias+$frame+24]
|
||||||
|
|
||||||
addcc $j,8,$j
|
addcc $j,8,$j
|
||||||
bnz,pt %icc,.Linner
|
bnz,pt %icc,.Linner
|
||||||
add $tp,8,$tp
|
add $tp,8,$tp
|
||||||
|
|
||||||
|
.Linnerskip:
|
||||||
|
ldx [%sp+$bias+$frame+0],%o0
|
||||||
|
ldx [%sp+$bias+$frame+8],%o1
|
||||||
|
ldx [%sp+$bias+$frame+16],%o2
|
||||||
|
ldx [%sp+$bias+$frame+24],%o3
|
||||||
|
|
||||||
|
srlx %o0,16,%o7
|
||||||
|
add %o7,%o1,%o1
|
||||||
|
srlx %o1,16,%o7
|
||||||
|
add %o7,%o2,%o2
|
||||||
|
srlx %o2,16,%o7
|
||||||
|
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
|
||||||
|
and %o0,$mask,%o0
|
||||||
|
and %o1,$mask,%o1
|
||||||
|
and %o2,$mask,%o2
|
||||||
|
sllx %o1,16,%o1
|
||||||
|
sllx %o2,32,%o2
|
||||||
|
sllx %o3,48,%o7
|
||||||
|
or %o1,%o0,%o0
|
||||||
|
or %o2,%o0,%o0
|
||||||
|
or %o7,%o0,%o0 ! 64-bit result
|
||||||
|
addcc %g1,%o0,%o0
|
||||||
|
srlx %o3,16,%g1 ! 34-bit carry
|
||||||
|
bcs,a %xcc,.+8
|
||||||
|
add %g1,1,%g1
|
||||||
|
|
||||||
|
ldx [$tp+8],%o7 ! tp[j]
|
||||||
|
addcc %o7,%o0,%o0
|
||||||
|
bcs,a %xcc,.+8
|
||||||
|
add %g1,1,%g1
|
||||||
|
|
||||||
|
stx %o0,[$tp] ! tp[j-1]
|
||||||
|
add $tp,8,$tp
|
||||||
|
|
||||||
fdtox $dota,$dota
|
fdtox $dota,$dota
|
||||||
fdtox $dotb,$dotb
|
fdtox $dotb,$dotb
|
||||||
|
Loading…
x
Reference in New Issue
Block a user