refactor and optimize scalarproduct
29-105% faster apply_filter, 6-90% faster ape decoding on core2
(Any x86 other than core2 probably gets much less, since this is mostly
due to ssse3 cachesplit avoidance and I haven't written the full gamut
of other cachesplit modes.)
9-123% faster ape decoding on G4.

Originally committed as revision 20739 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent e470691aa8
commit b1159ad928
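For orientation before the hunks: the commit replaces three DSP passes over the filter coefficients (scalarproduct_int16 plus a sign-dependent add_int16 or sub_int16) with one combined primitive. Its portable reference version, added in the dsputil.c hunk below, is reproduced here; only the comments are editorial.

#include <stdint.h>

/* returns the dot product of v1 and v2 and, in the same loop, updates
 * v1[i] += mul * v3[i]; order (the vector length) must be a multiple of 16 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2,
                                              int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

In do_apply_filter() the single call ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data)) now performs the prediction and the coefficient adaptation together, with the sign of the current sample selecting the adaptation direction.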
libavcodec/apedec.c
@@ -648,22 +648,16 @@ static void init_filter(APEContext * ctx, APEFilter *f, int16_t * buf, int order
    do_init_filter(&f[1], buf + order * 3 + HISTORY_SIZE, order);
}

static inline void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
static void do_apply_filter(APEContext * ctx, int version, APEFilter *f, int32_t *data, int count, int order, int fracbits)
{
    int res;
    int absres;

    while (count--) {
        /* round fixedpoint scalar product */
        res = (ctx->dsp.scalarproduct_int16(f->delay - order, f->coeffs, order, 0) + (1 << (fracbits - 1))) >> fracbits;

        if (*data < 0)
            ctx->dsp.add_int16(f->coeffs, f->adaptcoeffs - order, order);
        else if (*data > 0)
            ctx->dsp.sub_int16(f->coeffs, f->adaptcoeffs - order, order);

        res = ctx->dsp.scalarproduct_and_madd_int16(f->coeffs, f->delay - order, f->adaptcoeffs - order, order, APESIGN(*data));
        res = (res + (1 << (fracbits - 1))) >> fracbits;
        res += *data;

        *data++ = res;

        /* Update the output history */
libavcodec/dsputil.c
@@ -4298,18 +4298,6 @@ void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, i
    }
}

static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
        *v1++ += *v2++;
}

static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
        *v1++ -= *v2++;
}

static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int res = 0;
@@ -4320,6 +4308,16 @@ static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int
    return res;
}

static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
@@ -4848,9 +4846,8 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    c->vector_clipf = vector_clipf_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->add_int16 = add_int16_c;
    c->sub_int16 = sub_int16_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;
libavcodec/dsputil.h
@@ -560,23 +560,19 @@ typedef struct DSPContext {
    void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
           int * range, int * sum, int edges);

    /* ape functions */
    /**
     * Add contents of the second vector to the first one.
     * @param len length of vectors, should be multiple of 16
     */
    void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
    /**
     * Add contents of the second vector to the first one.
     * @param len length of vectors, should be multiple of 16
     */
    void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
    /**
     * Calculate scalar product of two vectors.
     * @param len length of vectors, should be multiple of 16
     * @param shift number of bits to discard from product
     */
    int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
    /* ape functions */
    /**
     * Calculate scalar product of v1 and v2,
     * and v1[i] += v3[i] * mul
     * @param len length of vectors, should be multiple of 16
     */
    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, int16_t *v2, int16_t *v3, int len, int mul);

    /* rv30 functions */
    qpel_mc_func put_rv30_tpel_pixels_tab[4][16];
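A minimal caller-side sketch of the new entry point documented above. Apart from DSPContext, dsputil_init() and the function pointer itself, everything here is illustrative (the wrapper name and its arguments are not from the commit); it assumes dsputil_init() has already installed the best implementation for the host CPU.

#include "dsputil.h"

/* illustrative wrapper: v1 must be 16-byte aligned, len a multiple of 16 */
static int32_t filter_step(DSPContext *dsp, int16_t *v1, int16_t *v2,
                           int16_t *v3, int len, int mul)
{
    /* res = sum of v1[i] * v2[i]; side effect: v1[i] += mul * v3[i] */
    return dsp->scalarproduct_and_madd_int16(v1, v2, v3, len, mul);
}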
libavcodec/ppc/int_altivec.c
@@ -79,34 +79,6 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
    return u.score[3];
}

static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16 vec, *pv;

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}

static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
{
    int i;
    register vec_s16 vec, *pv;

    for(i = 0; i < order; i += 8){
        pv = (vec_s16*)v2;
        vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
        vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
        v1 += 8;
        v2 += 8;
    }
}

static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order, const int shift)
{
    int i;
@@ -137,10 +109,44 @@ static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order
    return ires;
}

static int32_t scalarproduct_and_madd_int16_altivec(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    LOAD_ZERO;
    vec_s16 *pv1 = (vec_s16*)v1;
    vec_s16 *pv2 = (vec_s16*)v2;
    vec_s16 *pv3 = (vec_s16*)v3;
    register vec_s16 muls = {mul,mul,mul,mul,mul,mul,mul,mul};
    register vec_s16 t0, t1, i0, i1;
    register vec_s16 i2 = pv2[0], i3 = pv3[0];
    register vec_s32 res = zero_s32v;
    register vec_u8 align = vec_lvsl(0, v2);
    int32_t ires;
    order >>= 4;
    do {
        t0 = vec_perm(i2, pv2[1], align);
        i2 = pv2[2];
        t1 = vec_perm(pv2[1], i2, align);
        i0 = pv1[0];
        i1 = pv1[1];
        res = vec_msum(t0, i0, res);
        res = vec_msum(t1, i1, res);
        t0 = vec_perm(i3, pv3[1], align);
        i3 = pv3[2];
        t1 = vec_perm(pv3[1], i3, align);
        pv1[0] = vec_mladd(t0, muls, i0);
        pv1[1] = vec_mladd(t1, muls, i1);
        pv1 += 2;
        pv2 += 2;
        pv3 += 2;
    } while(--order);
    res = vec_splat(vec_sums(res, zero_s32v), 3);
    vec_ste(res, 0, &ires);
    return ires;
}

void int_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_altivec;
    c->add_int16 = add_int16_altivec;
    c->sub_int16 = sub_int16_altivec;
    c->scalarproduct_int16 = scalarproduct_int16_altivec;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_altivec;
}
libavcodec/x86/dsputil_mmx.c
@@ -2384,12 +2384,11 @@ static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
void ff_add_int16_mmx2(int16_t * v1, int16_t * v2, int order);
void ff_add_int16_sse2(int16_t * v1, int16_t * v2, int order);
void ff_sub_int16_mmx2(int16_t * v1, int16_t * v2, int order);
void ff_sub_int16_sse2(int16_t * v1, int16_t * v2, int order);
int32_t ff_scalarproduct_int16_mmx2(int16_t * v1, int16_t * v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift);
int32_t ff_scalarproduct_int16_mmx2(int16_t *v1, int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(int16_t *v1, int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul);
void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
@@ -2951,9 +2950,8 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        }
        if(mm_flags & FF_MM_MMX2){
#if HAVE_YASM
            c->add_int16 = ff_add_int16_mmx2;
            c->sub_int16 = ff_sub_int16_mmx2;
            c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
#endif
        }
        if(mm_flags & FF_MM_SSE){
@@ -2975,11 +2973,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
            c->float_to_int16 = float_to_int16_sse2;
            c->float_to_int16_interleave = float_to_int16_interleave_sse2;
#if HAVE_YASM
            c->add_int16 = ff_add_int16_sse2;
            c->sub_int16 = ff_sub_int16_sse2;
            c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
#endif
        }
        if((mm_flags & FF_MM_SSSE3) && !(mm_flags & (FF_MM_SSE42|FF_MM_3DNOW)) && HAVE_YASM) // cachesplit
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    }

    if (CONFIG_ENCODERS)
libavcodec/x86/dsputil_yasm.asm
@@ -100,43 +100,7 @@ FLOAT_TO_INT16_INTERLEAVE6 3dn2


%macro SCALARPRODUCT 1
; void add_int16(int16_t * v1, int16_t * v2, int order)
cglobal add_int16_%1, 3,3,2, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    paddw   m0, [v1q + orderq]
    paddw   m1, [v1q + orderq + mmsize]
    mova    [v1q + orderq], m0
    mova    [v1q + orderq + mmsize], m1
    add     orderq, mmsize*2
    jl .loop
    REP_RET

; void sub_int16(int16_t * v1, int16_t * v2, int order)
cglobal sub_int16_%1, 3,3,4, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
.loop:
    movu    m2, [v2q + orderq]
    movu    m3, [v2q + orderq + mmsize]
    mova    m0, [v1q + orderq]
    mova    m1, [v1q + orderq + mmsize]
    psubw   m0, m2
    psubw   m1, m3
    mova    [v1q + orderq], m0
    mova    [v1q + orderq + mmsize], m1
    add     orderq, mmsize*2
    jl .loop
    REP_RET

; int scalarproduct_int16_sse2(int16_t * v1, int16_t * v2, int order, int shift)
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
@@ -165,6 +129,51 @@ cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    paddd   m2, m0
    movd    eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 3,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX
@@ -172,6 +181,87 @@ SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    pmaddwd m0, [v1q + orderq]
    pmaddwd m1, [v1q + orderq + mmsize]
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, [v1q + orderq]
    paddw   m3, [v1q + orderq + mmsize]
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET



; void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
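Why the SSSE3 path is the big win on Core 2: movu loads that straddle a cache-line boundary ("cachesplits") are slow there, so SCALARPRODUCT_LOOP above never issues an unaligned load. It aligns v2/v3 down to 16 bytes, loads whole aligned vectors, and rebuilds the misaligned view with palignr at a fixed byte offset. A scalar C sketch of that idea (helper name and layout are illustrative, not code from the commit):

#include <stdint.h>
#include <string.h>

/* given two adjacent 16-byte aligned blocks lo and hi, produce the 16 bytes
 * starting 'shift' bytes into lo -- the window palignr extracts from a pair
 * of registers, so no single load ever crosses a cache line */
static void aligned_window(uint8_t *dst, const uint8_t *lo,
                           const uint8_t *hi, int shift)
{
    uint8_t tmp[32];
    memcpy(tmp,      lo, 16);
    memcpy(tmp + 16, hi, 16);
    memcpy(dst, tmp + shift, 16);
}

Because v2 and v3 advance by whole vectors each iteration, the byte offset never changes, which is why one loop body is emitted per possible offset (0, 2, ..., 14) and selected once by the compare chain before entering the loop.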