Merge commit '4a7af92cc80ced8498626401ed21f25ffe6740c8'
* commit '4a7af92cc80ced8498626401ed21f25ffe6740c8': sbrdsp: Unroll and use integer operations; sbrdsp: Unroll sbr_autocorrelate_c; x86: sbrdsp: Implement SSE2 qmf_deint_bfly. Conflicts: libavcodec/sbrdsp.c, libavcodec/x86/sbrdsp.asm, libavcodec/x86/sbrdsp_init.c. Merged-by: Michael Niedermayer <michaelni@gmx.at>
This commit is contained in:
commit
711c8ee71d
@ -52,41 +52,41 @@ static float sbr_sum_square_c(float (*x)[2], int n)
|
||||
|
||||
static void sbr_neg_odd_64_c(float *x)
|
||||
{
|
||||
union av_intfloat32 *xi = (union av_intfloat32*)x;
|
||||
union av_intfloat32 *xi = (union av_intfloat32*) x;
|
||||
int i;
|
||||
for (i = 1; i < 64; i += 4)
|
||||
{
|
||||
xi[i+0].i ^= 1U<<31;
|
||||
xi[i+2].i ^= 1U<<31;
|
||||
for (i = 1; i < 64; i += 4) {
|
||||
xi[i + 0].i ^= 1U << 31;
|
||||
xi[i + 2].i ^= 1U << 31;
|
||||
}
|
||||
}
|
||||
|
||||
static void sbr_qmf_pre_shuffle_c(float *z)
|
||||
{
|
||||
union av_intfloat32 *zi = (union av_intfloat32*)z;
|
||||
union av_intfloat32 *zi = (union av_intfloat32*) z;
|
||||
int k;
|
||||
zi[64].i = zi[0].i;
|
||||
zi[65].i = zi[1].i;
|
||||
for (k = 1; k < 31; k+=2) {
|
||||
zi[64+2*k+0].i = zi[64 - k].i ^ (1U<<31);
|
||||
zi[64+2*k+1].i = zi[ k + 1].i;
|
||||
zi[64+2*k+2].i = zi[63 - k].i ^ (1U<<31);
|
||||
zi[64+2*k+3].i = zi[ k + 2].i;
|
||||
for (k = 1; k < 31; k += 2) {
|
||||
zi[64 + 2 * k + 0].i = zi[64 - k].i ^ (1U << 31);
|
||||
zi[64 + 2 * k + 1].i = zi[ k + 1].i;
|
||||
zi[64 + 2 * k + 2].i = zi[63 - k].i ^ (1U << 31);
|
||||
zi[64 + 2 * k + 3].i = zi[ k + 2].i;
|
||||
}
|
||||
zi[64+2*31+0].i = zi[64 - 31].i ^ (1U<<31);
|
||||
zi[64+2*31+1].i = zi[31 + 1].i;
|
||||
|
||||
zi[64 + 2 * 31 + 0].i = zi[64 - 31].i ^ (1U << 31);
|
||||
zi[64 + 2 * 31 + 1].i = zi[31 + 1].i;
|
||||
}
|
||||
|
||||
static void sbr_qmf_post_shuffle_c(float W[32][2], const float *z)
|
||||
{
|
||||
const union av_intfloat32 *zi = (const union av_intfloat32*)z;
|
||||
union av_intfloat32 *Wi = (union av_intfloat32*)W;
|
||||
const union av_intfloat32 *zi = (const union av_intfloat32*) z;
|
||||
union av_intfloat32 *Wi = (union av_intfloat32*) W;
|
||||
int k;
|
||||
for (k = 0; k < 32; k+=2) {
|
||||
Wi[2*k+0].i = zi[63-k].i ^ (1U<<31);
|
||||
Wi[2*k+1].i = zi[k+0].i;
|
||||
Wi[2*k+2].i = zi[62-k].i ^ (1U<<31);
|
||||
Wi[2*k+3].i = zi[k+1].i;
|
||||
for (k = 0; k < 32; k += 2) {
|
||||
Wi[2 * k + 0].i = zi[63 - k].i ^ (1U << 31);
|
||||
Wi[2 * k + 1].i = zi[ k + 0].i;
|
||||
Wi[2 * k + 2].i = zi[62 - k].i ^ (1U << 31);
|
||||
Wi[2 * k + 3].i = zi[ k + 1].i;
|
||||
}
|
||||
}
|
||||
|
||||
@ -96,8 +96,8 @@ static void sbr_qmf_deint_neg_c(float *v, const float *src)
|
||||
union av_intfloat32 *vi = (union av_intfloat32*)v;
|
||||
int i;
|
||||
for (i = 0; i < 32; i++) {
|
||||
vi[ i].i = si[63 - 2*i ].i;
|
||||
vi[63 - i].i = si[63 - 2*i - 1].i ^ (1U<<31);
|
||||
vi[ i].i = si[63 - 2 * i ].i;
|
||||
vi[63 - i].i = si[63 - 2 * i - 1].i ^ (1U << 31);
|
||||
}
|
||||
}
|
||||
|
||||
@ -139,32 +139,32 @@ static av_always_inline void autocorrelate(const float x[40][2],
|
||||
static void sbr_autocorrelate_c(const float x[40][2], float phi[3][2][2])
|
||||
{
|
||||
#if 0
|
||||
// This code is slower because it multiplies memory accesses.
|
||||
// It is left as eucational purpose and because it may offer
|
||||
// a better reference for writing arch-specific dsp functions.
|
||||
/* This code is slower because it multiplies memory accesses.
|
||||
* It is left for educational purposes and because it may offer
|
||||
* a better reference for writing arch-specific DSP functions. */
|
||||
autocorrelate(x, phi, 0);
|
||||
autocorrelate(x, phi, 1);
|
||||
autocorrelate(x, phi, 2);
|
||||
#else
|
||||
float real_sum2 = x[ 0][0] * x[ 2][0] + x[ 0][1] * x[ 2][1];
|
||||
float imag_sum2 = x[ 0][0] * x[ 2][1] - x[ 0][1] * x[ 2][0];
|
||||
float real_sum1 = 0.f, imag_sum1 = 0.f, real_sum0 = 0.0f;
|
||||
float real_sum2 = x[0][0] * x[2][0] + x[0][1] * x[2][1];
|
||||
float imag_sum2 = x[0][0] * x[2][1] - x[0][1] * x[2][0];
|
||||
float real_sum1 = 0.0f, imag_sum1 = 0.0f, real_sum0 = 0.0f;
|
||||
int i;
|
||||
for (i = 1; i < 38; i++) {
|
||||
real_sum0 += x[i][0] * x[i ][0] + x[i][1] * x[i ][1];
|
||||
real_sum1 += x[i][0] * x[i+1][0] + x[i][1] * x[i+1][1];
|
||||
imag_sum1 += x[i][0] * x[i+1][1] - x[i][1] * x[i+1][0];
|
||||
real_sum2 += x[i][0] * x[i+2][0] + x[i][1] * x[i+2][1];
|
||||
imag_sum2 += x[i][0] * x[i+2][1] - x[i][1] * x[i+2][0];
|
||||
real_sum0 += x[i][0] * x[i ][0] + x[i][1] * x[i ][1];
|
||||
real_sum1 += x[i][0] * x[i + 1][0] + x[i][1] * x[i + 1][1];
|
||||
imag_sum1 += x[i][0] * x[i + 1][1] - x[i][1] * x[i + 1][0];
|
||||
real_sum2 += x[i][0] * x[i + 2][0] + x[i][1] * x[i + 2][1];
|
||||
imag_sum2 += x[i][0] * x[i + 2][1] - x[i][1] * x[i + 2][0];
|
||||
}
|
||||
phi[2-2][1][0] = real_sum2;
|
||||
phi[2-2][1][1] = imag_sum2;
|
||||
phi[2 ][1][0] = real_sum0 + x[ 0][0] * x[ 0][0] + x[ 0][1] * x[ 0][1];
|
||||
phi[1 ][0][0] = real_sum0 + x[38][0] * x[38][0] + x[38][1] * x[38][1];
|
||||
phi[2-1][1][0] = real_sum1 + x[ 0][0] * x[ 1][0] + x[ 0][1] * x[ 1][1];
|
||||
phi[2-1][1][1] = imag_sum1 + x[ 0][0] * x[ 1][1] - x[ 0][1] * x[ 1][0];
|
||||
phi[0 ][0][0] = real_sum1 + x[38][0] * x[39][0] + x[38][1] * x[39][1];
|
||||
phi[0 ][0][1] = imag_sum1 + x[38][0] * x[39][1] - x[38][1] * x[39][0];
|
||||
phi[2 - 2][1][0] = real_sum2;
|
||||
phi[2 - 2][1][1] = imag_sum2;
|
||||
phi[2 ][1][0] = real_sum0 + x[ 0][0] * x[ 0][0] + x[ 0][1] * x[ 0][1];
|
||||
phi[1 ][0][0] = real_sum0 + x[38][0] * x[38][0] + x[38][1] * x[38][1];
|
||||
phi[2 - 1][1][0] = real_sum1 + x[ 0][0] * x[ 1][0] + x[ 0][1] * x[ 1][1];
|
||||
phi[2 - 1][1][1] = imag_sum1 + x[ 0][0] * x[ 1][1] - x[ 0][1] * x[ 1][0];
|
||||
phi[0 ][0][0] = real_sum1 + x[38][0] * x[39][0] + x[38][1] * x[39][1];
|
||||
phi[0 ][0][1] = imag_sum1 + x[38][0] * x[39][1] - x[38][1] * x[39][0];
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -252,36 +252,36 @@ cglobal sbr_neg_odd_64, 1,2,4,z
|
||||
; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
|
||||
%macro SBR_QMF_DEINT_BFLY 0
|
||||
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
|
||||
mov cq, 64*4-2*mmsize
|
||||
lea vrevq, [vq + 64*4]
|
||||
mov cq, 64*4-2*mmsize
|
||||
lea vrevq, [vq + 64*4]
|
||||
.loop:
|
||||
mova m0, [src0q+cq]
|
||||
mova m1, [src1q]
|
||||
mova m4, [src0q+cq+mmsize]
|
||||
mova m5, [src1q+mmsize]
|
||||
mova m0, [src0q+cq]
|
||||
mova m1, [src1q]
|
||||
mova m4, [src0q+cq+mmsize]
|
||||
mova m5, [src1q+mmsize]
|
||||
%if cpuflag(sse2)
|
||||
pshufd m2, m0, q0123
|
||||
pshufd m3, m1, q0123
|
||||
pshufd m6, m4, q0123
|
||||
pshufd m7, m5, q0123
|
||||
pshufd m2, m0, q0123
|
||||
pshufd m3, m1, q0123
|
||||
pshufd m6, m4, q0123
|
||||
pshufd m7, m5, q0123
|
||||
%else
|
||||
shufps m2, m0, m0, q0123
|
||||
shufps m3, m1, m1, q0123
|
||||
shufps m6, m4, m4, q0123
|
||||
shufps m7, m5, m5, q0123
|
||||
shufps m2, m0, m0, q0123
|
||||
shufps m3, m1, m1, q0123
|
||||
shufps m6, m4, m4, q0123
|
||||
shufps m7, m5, m5, q0123
|
||||
%endif
|
||||
addps m5, m2
|
||||
subps m0, m7
|
||||
addps m1, m6
|
||||
subps m4, m3
|
||||
mova [vrevq], m1
|
||||
addps m5, m2
|
||||
subps m0, m7
|
||||
addps m1, m6
|
||||
subps m4, m3
|
||||
mova [vrevq], m1
|
||||
mova [vrevq+mmsize], m5
|
||||
mova [vq+cq], m0
|
||||
mova [vq+cq], m0
|
||||
mova [vq+cq+mmsize], m4
|
||||
add src1q, 2*mmsize
|
||||
add vrevq, 2*mmsize
|
||||
sub cq, 2*mmsize
|
||||
jge .loop
|
||||
add src1q, 2*mmsize
|
||||
add vrevq, 2*mmsize
|
||||
sub cq, 2*mmsize
|
||||
jge .loop
|
||||
REP_RET
|
||||
%endmacro
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user