x86/sbrdsp: Use different mem moves
Before: 2843 decicycles in ff_sbr_autocorrelate_sse3, 262086 runs, 58 skips.
After: 2693 decicycles in ff_sbr_autocorrelate_sse3, 262117 runs, 27 skips.
Signed-off-by: James Almer <jamrial@gmail.com>
This commit is contained in:
parent
449b21bfab
commit
7aeafacfd0
@ -454,13 +454,15 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
|
|||||||
neg cntq
|
neg cntq
|
||||||
|
|
||||||
%if cpuflag(sse3)
|
%if cpuflag(sse3)
|
||||||
|
%define MOVH movsd
|
||||||
movddup m5, [xq+cntq]
|
movddup m5, [xq+cntq]
|
||||||
%else
|
%else
|
||||||
|
%define MOVH movlps
|
||||||
movlps m5, [xq+cntq]
|
movlps m5, [xq+cntq]
|
||||||
movlhps m5, m5
|
movlhps m5, m5
|
||||||
%endif
|
%endif
|
||||||
movlps m7, [xq+cntq+8 ]
|
MOVH m7, [xq+cntq+8 ]
|
||||||
movlps m1, [xq+cntq+16]
|
MOVH m1, [xq+cntq+16]
|
||||||
shufps m7, m7, q0110
|
shufps m7, m7, q0110
|
||||||
shufps m1, m1, q0110
|
shufps m1, m1, q0110
|
||||||
mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
|
mulps m3, m5, m7 ; x[0][0] * x[1][0], x[0][1] * x[1][1], x[0][0] * x[1][1], x[0][1] * x[1][0]
|
||||||
@ -470,7 +472,7 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
|
|||||||
movaps [rsp+16], m4
|
movaps [rsp+16], m4
|
||||||
add cntq, 8
|
add cntq, 8
|
||||||
|
|
||||||
movlps m2, [xq+cntq+16]
|
MOVH m2, [xq+cntq+16]
|
||||||
movlhps m7, m7
|
movlhps m7, m7
|
||||||
shufps m2, m2, q0110
|
shufps m2, m2, q0110
|
||||||
mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
|
mulps m6, m7, m1 ; real_sum1 = x[1][0] * x[2][0], x[1][1] * x[2][1]; imag_sum1 += x[1][0] * x[2][1], x[1][1] * x[2][0]
|
||||||
@ -481,7 +483,7 @@ cglobal sbr_autocorrelate, 2,3,8,32, x, phi, cnt
|
|||||||
align 16
|
align 16
|
||||||
.loop:
|
.loop:
|
||||||
add cntq, 8
|
add cntq, 8
|
||||||
movlps m0, [xq+cntq+16]
|
MOVH m0, [xq+cntq+16]
|
||||||
movlhps m1, m1
|
movlhps m1, m1
|
||||||
shufps m0, m0, q0110
|
shufps m0, m0, q0110
|
||||||
mulps m3, m1, m2
|
mulps m3, m1, m2
|
||||||
@ -491,7 +493,7 @@ align 16
|
|||||||
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
|
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
|
||||||
addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
|
addps m7, m1 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
|
||||||
add cntq, 8
|
add cntq, 8
|
||||||
movlps m1, [xq+cntq+16]
|
MOVH m1, [xq+cntq+16]
|
||||||
movlhps m2, m2
|
movlhps m2, m2
|
||||||
shufps m1, m1, q0110
|
shufps m1, m1, q0110
|
||||||
mulps m3, m2, m0
|
mulps m3, m2, m0
|
||||||
@ -501,7 +503,7 @@ align 16
|
|||||||
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
|
addps m5, m4 ; real_sum2 += x[i][0] * x[i + 2][0], x[i][1] * x[i + 2][1]; imag_sum2 += x[i][0] * x[i + 2][1], x[i][1] * x[i + 2][0];
|
||||||
addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
|
addps m7, m2 ; real_sum0 += x[i][0] * x[i][0], x[i][1] * x[i][1];
|
||||||
add cntq, 8
|
add cntq, 8
|
||||||
movlps m2, [xq+cntq+16]
|
MOVH m2, [xq+cntq+16]
|
||||||
movlhps m0, m0
|
movlhps m0, m0
|
||||||
shufps m2, m2, q0110
|
shufps m2, m2, q0110
|
||||||
mulps m3, m0, m1
|
mulps m3, m0, m1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user