factor sum_abs_dctelem out of dct_sad, and simd it.
Cycle counts for sum_abs_dctelem_* alone:
  core2: c=186 mmx2=39 sse2=21 ssse3=13
  k8:    c=163 mmx2=33 sse2=31
  p4:    c=370 mmx2=60 sse2=60
Cycle counts for dct_sad including sum_abs_dctelem_*:
  core2: c=405 mmx2=258 sse2=240 ssse3=232
  k8:    c=624 mmx2=394 sse2=392
  p4:    c=849 mmx2=556 sse2=556
Originally committed as revision 9001 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
561f940c03
commit
1edbfe1994
@ -592,6 +592,14 @@ static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_siz
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Plain C version of sum_abs_dctelem: adds up FFABS() of each of the
 * 64 coefficients in block.
 *
 * @param block 64 DCT elements
 * @return sum of the absolute values of all 64 elements
 */
static int sum_abs_dctelem_c(DCTELEM *block)
{
    const DCTELEM *cur = block;
    const DCTELEM *const end = block + 64;
    int total = 0;

    while (cur < end) {
        /* FFABS is a macro: keep its argument free of side effects */
        total += FFABS(*cur);
        cur++;
    }

    return total;
}
|
||||
|
||||
#if 0
|
||||
|
||||
#define PIXOP2(OPNAME, OP) \
|
||||
@ -3385,19 +3393,14 @@ static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_
|
||||
|
||||
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
|
||||
MpegEncContext * const s= (MpegEncContext *)c;
|
||||
DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
|
||||
DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
|
||||
DCTELEM * const temp= (DCTELEM*)aligned_temp;
|
||||
int sum=0, i;
|
||||
|
||||
assert(h==8);
|
||||
|
||||
s->dsp.diff_pixels(temp, src1, src2, stride);
|
||||
s->dsp.fdct(temp);
|
||||
|
||||
for(i=0; i<64; i++)
|
||||
sum+= FFABS(temp[i]);
|
||||
|
||||
return sum;
|
||||
return s->dsp.sum_abs_dctelem(temp);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_GPL
|
||||
@ -3905,6 +3908,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)
|
||||
c->add_pixels_clamped = add_pixels_clamped_c;
|
||||
c->add_pixels8 = add_pixels8_c;
|
||||
c->add_pixels4 = add_pixels4_c;
|
||||
c->sum_abs_dctelem = sum_abs_dctelem_c;
|
||||
c->gmc1 = gmc1_c;
|
||||
c->gmc = ff_gmc_c;
|
||||
c->clear_blocks = clear_blocks_c;
|
||||
|
@ -163,6 +163,7 @@ typedef struct DSPContext {
|
||||
void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
|
||||
void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
|
||||
void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
|
||||
int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
|
||||
/**
|
||||
* translational global motion compensation.
|
||||
*/
|
||||
|
@ -1649,6 +1649,9 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
|
||||
"movq "#c", "#o"+16(%1) \n\t"\
|
||||
"movq "#d", "#o"+24(%1) \n\t"\
|
||||
|
||||
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
|
||||
* about 100k on extreme inputs. But that's very unlikely to occur in natural video,
|
||||
* and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
|
||||
#define HSUM_MMX(a, t, dst)\
|
||||
"movq "#a", "#t" \n\t"\
|
||||
"psrlq $32, "#a" \n\t"\
|
||||
@ -1802,6 +1805,71 @@ HADAMARD8_DIFF_SSE2(ssse3)
|
||||
#undef MMABS_SUM_8x8
|
||||
#endif
|
||||
|
||||
/* DCT_SAD4(m, mm, o): asm fragment handling four vectors of the block.
 *   m  - suffix appended to "mov" to form the load instruction (q / dqa)
 *   mm - register name prefix (%%mm or %%xmm)
 *   o  - byte offset added to the block pointer (%1)
 * Loads from o+0, o+16, o+32 and o+48 into registers 2..5, then hands each
 * to MMABS_SUM with register 6 or 7 as second argument and register 0 or 1
 * as third argument.
 * NOTE(review): MMABS_SUM is defined outside this view; presumably it adds
 * the absolute value of its first argument into the third using the second
 * as scratch -- confirm against its definition. */
#define DCT_SAD4(m,mm,o)\
|
||||
"mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
|
||||
"mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
|
||||
"mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
|
||||
"mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
|
||||
MMABS_SUM(mm##2, mm##6, mm##0)\
|
||||
MMABS_SUM(mm##3, mm##7, mm##1)\
|
||||
MMABS_SUM(mm##4, mm##6, mm##0)\
|
||||
MMABS_SUM(mm##5, mm##7, mm##1)\
|
||||
|
||||
/* Asm body for the 64-bit MMX register variants of sum_abs_dctelem.
 * Clears mm0 and mm1, runs DCT_SAD4 with 8-byte movq loads at base offsets
 * 0, 8, 64 and 72 (together with DCT_SAD4's +0/+16/+32/+48 steps this
 * touches bytes 0..127 of the block), merges mm1 into mm0 with a saturating
 * unsigned word add (paddusw) and lets HSUM write the result to operand %0.
 * HSUM and MMABS must be defined by the user of this macro. */
#define DCT_SAD_MMX\
|
||||
"pxor %%mm0, %%mm0 \n\t"\
|
||||
"pxor %%mm1, %%mm1 \n\t"\
|
||||
DCT_SAD4(q, %%mm, 0)\
|
||||
DCT_SAD4(q, %%mm, 8)\
|
||||
DCT_SAD4(q, %%mm, 64)\
|
||||
DCT_SAD4(q, %%mm, 72)\
|
||||
"paddusw %%mm1, %%mm0 \n\t"\
|
||||
HSUM(%%mm0, %%mm1, %0)
|
||||
|
||||
/* Asm body for the 128-bit XMM register variants of sum_abs_dctelem.
 * Same structure as DCT_SAD_MMX, but each load is a 16-byte movdqa, so two
 * DCT_SAD4 invocations (base offsets 0 and 64) touch bytes 0..127 of the
 * block. movdqa is an aligned load, so the block pointer (%1) has to be
 * 16-byte aligned. xmm1 is merged into xmm0 with a saturating unsigned word
 * add (paddusw) and HSUM writes the result to operand %0. */
#define DCT_SAD_SSE2\
|
||||
"pxor %%xmm0, %%xmm0 \n\t"\
|
||||
"pxor %%xmm1, %%xmm1 \n\t"\
|
||||
DCT_SAD4(dqa, %%xmm, 0)\
|
||||
DCT_SAD4(dqa, %%xmm, 64)\
|
||||
"paddusw %%xmm1, %%xmm0 \n\t"\
|
||||
HSUM(%%xmm0, %%xmm1, %0)
|
||||
|
||||
/* DCT_SAD_FUNC(cpu): defines static int sum_abs_dctelem_<cpu>(DCTELEM *block).
 * The function body is a single asm statement made of whatever DCT_SAD
 * expands to at the point of instantiation; operand %0 is the int result
 * and operand %1 is the block pointer. Only the low 16 bits of the asm
 * result are returned (sum&0xFFFF).
 * NOTE(review): the asm statement declares no clobbers although the DCT_SAD
 * bodies write mm0/mm1 or xmm0/xmm1 and read memory through %1 -- confirm
 * this matches the conventions used for the other asm in this file. */
#define DCT_SAD_FUNC(cpu) \
|
||||
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
|
||||
int sum;\
|
||||
asm volatile(\
|
||||
DCT_SAD\
|
||||
:"=r"(sum)\
|
||||
:"r"(block)\
|
||||
);\
|
||||
return sum&0xFFFF;\
|
||||
}
|
||||
|
||||
/* Instantiate sum_abs_dctelem_{mmx,mmx2,sse2,ssse3} by binding DCT_SAD, HSUM
 * and MMABS to the matching variants before each DCT_SAD_FUNC() expansion. */

/* mmx: MMX body, MMX horizontal sum, MMX abs */
#define DCT_SAD DCT_SAD_MMX
|
||||
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
|
||||
#define MMABS(a,z) MMABS_MMX(a,z)
|
||||
DCT_SAD_FUNC(mmx)
|
||||
#undef MMABS
|
||||
#undef HSUM
|
||||
|
||||
/* mmx2: DCT_SAD is still DCT_SAD_MMX; only HSUM and MMABS are rebound */
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
|
||||
#define MMABS(a,z) MMABS_MMX2(a,z)
|
||||
DCT_SAD_FUNC(mmx2)
|
||||
#undef HSUM
|
||||
#undef DCT_SAD
|
||||
|
||||
/* sse2: MMABS is not redefined here, so the MMABS_MMX2 binding from above
 * is still in effect for this expansion; it is dropped right afterwards */
#define DCT_SAD DCT_SAD_SSE2
|
||||
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
|
||||
DCT_SAD_FUNC(sse2)
|
||||
#undef MMABS
|
||||
|
||||
/* ssse3: same SSE2 body and HSUM, with MMABS bound to MMABS_SSSE3;
 * only built when HAVE_SSSE3 is defined */
#ifdef HAVE_SSSE3
|
||||
#define MMABS(a,z) MMABS_SSSE3(a,z)
|
||||
DCT_SAD_FUNC(ssse3)
|
||||
#undef MMABS
|
||||
#endif
|
||||
#undef HSUM
|
||||
#undef DCT_SAD
|
||||
|
||||
static int ssd_int8_vs_int16_mmx(int8_t *pix1, int16_t *pix2, int size){
|
||||
int sum;
|
||||
long i=size;
|
||||
@ -3298,6 +3366,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
c->add_bytes= add_bytes_mmx;
|
||||
#ifdef CONFIG_ENCODERS
|
||||
c->diff_bytes= diff_bytes_mmx;
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
|
||||
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_mmx;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_mmx;
|
||||
@ -3350,6 +3419,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
|
||||
|
||||
#ifdef CONFIG_ENCODERS
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_mmx2;
|
||||
c->vsad[4]= vsad_intra16_mmx2;
|
||||
@ -3569,12 +3639,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
|
||||
#ifdef CONFIG_ENCODERS
|
||||
if(mm_flags & MM_SSE2){
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_sse2;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_sse2;
|
||||
}
|
||||
|
||||
#ifdef HAVE_SSSE3
|
||||
if(mm_flags & MM_SSSE3){
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user