Move hadamard_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
which will hopefully solve the Win64/FATE failures caused by these functions.

Originally committed as revision 25137 to svn://svn.ffmpeg.org/ffmpeg/trunk

commit e2e341048e
parent d0acc2d2e9
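
For orientation, here is a rough plain-C sketch (not part of this commit; the
function name and loop structure are illustrative only) of what hadamard8_diff
computes: the sum of absolute values of the 8x8 Hadamard transform of the
pixel-wise difference between two blocks. The asm below produces the same
quantity with SIMD butterflies (LBUTTERFLY2 / SUMSUB_BADC), transposes and
saturating adds instead of explicit loops.

    #include <stdint.h>
    #include <stdlib.h>

    /* illustrative reference, hypothetical name: 8x8 SATD of src1 - src2 */
    static int hadamard8_diff_sketch(const uint8_t *src1, const uint8_t *src2,
                                     int stride)
    {
        int16_t tmp[8][8];
        int sum = 0;

        /* rows: pixel difference, then an in-place 1-D Hadamard (3 butterfly passes) */
        for (int i = 0; i < 8; i++) {
            int16_t d[8];
            for (int j = 0; j < 8; j++)
                d[j] = src1[i * stride + j] - src2[i * stride + j];
            for (int step = 1; step < 8; step <<= 1)
                for (int j = 0; j < 8; j++)
                    if (!(j & step)) {
                        int16_t a = d[j], b = d[j + step];
                        d[j]        = a + b;
                        d[j + step] = a - b;
                    }
            for (int j = 0; j < 8; j++)
                tmp[i][j] = d[j];
        }

        /* columns: same 1-D Hadamard, then accumulate absolute values */
        for (int j = 0; j < 8; j++) {
            int16_t d[8];
            for (int i = 0; i < 8; i++)
                d[i] = tmp[i][j];
            for (int step = 1; step < 8; step <<= 1)
                for (int i = 0; i < 8; i++)
                    if (!(i & step)) {
                        int16_t a = d[i], b = d[i + step];
                        d[i]        = a + b;
                        d[i + step] = a - b;
                    }
            for (int i = 0; i < 8; i++)
                sum += abs(d[i]);
        }
        return sum;
    }

The *_diff16 variant accumulates this over the two (h==8) or four (h==16)
8x8 blocks of a 16-pixel-wide area, which is what the yasm
hadamard8_16_wrapper in this commit does.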
@@ -879,55 +879,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
     *left = src2[w-1];
 }
 
-#define DIFF_PIXELS_1(m,a,t,p1,p2)\
-    "mov"#m" "#p1", "#a" \n\t"\
-    "mov"#m" "#p2", "#t" \n\t"\
-    "punpcklbw "#a", "#t" \n\t"\
-    "punpcklbw "#a", "#a" \n\t"\
-    "psubw "#t", "#a" \n\t"\
-
-#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
-    uint8_t *p1b=p1, *p2b=p2;\
-    __asm__ volatile(\
-        DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
-        DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
-        DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
-        "add %4, %1 \n\t"\
-        "add %4, %2 \n\t"\
-        DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
-        DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
-        DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
-        DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
-        "mov"#m1" "#mm"0, %0 \n\t"\
-        DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
-        "mov"#m1" %0, "#mm"0 \n\t"\
-        : "+m"(temp), "+r"(p1b), "+r"(p2b)\
-        : "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
-    );\
-}
-//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
-
-#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
-#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
-
-#define LBUTTERFLY2(a1,b1,a2,b2)\
-    "paddw " #b1 ", " #a1 " \n\t"\
-    "paddw " #b2 ", " #a2 " \n\t"\
-    "paddw " #b1 ", " #b1 " \n\t"\
-    "paddw " #b2 ", " #b2 " \n\t"\
-    "psubw " #a1 ", " #b1 " \n\t"\
-    "psubw " #a2 ", " #b2 " \n\t"
-
-#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
-        LBUTTERFLY2(m0, m1, m2, m3)\
-        LBUTTERFLY2(m4, m5, m6, m7)\
-        LBUTTERFLY2(m0, m2, m1, m3)\
-        LBUTTERFLY2(m4, m6, m5, m7)\
-        LBUTTERFLY2(m0, m4, m1, m5)\
-        LBUTTERFLY2(m2, m6, m3, m7)\
-
-#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
-
 #define MMABS_MMX(a,z)\
     "pxor " #z ", " #z " \n\t"\
     "pcmpgtw " #a ", " #z " \n\t"\
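
A note on the LBUTTERFLY2 sequence deleted above: it performs a butterfly
without needing a scratch register. A scalar C equivalent (illustrative only,
not part of the commit):

    /* (a, b) -> (a0 + b0, b0 - a0), mirroring the paddw/psubw AT&T operand order */
    static inline void lbutterfly_sketch(int16_t *a, int16_t *b)
    {
        *a = *a + *b;  /* paddw b,a : a = a0 + b0                   */
        *b = *b + *b;  /* paddw b,b : b = 2*b0                      */
        *b = *b - *a;  /* psubw a,b : b = 2*b0 - (a0+b0) = b0 - a0  */
    }

The yasm port relies on x86util's SUMSUB_BADC macro for the same step.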
@@ -946,34 +897,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
     MMABS(a,z)\
     "paddusw " #a ", " #sum " \n\t"
 
-#define MMABS_SUM_8x8_NOSPILL\
-    MMABS(%%xmm0, %%xmm8)\
-    MMABS(%%xmm1, %%xmm9)\
-    MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
-    MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
-    MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
-    MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
-    MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
-    MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
-    "paddusw %%xmm1, %%xmm0 \n\t"
-
-#if ARCH_X86_64
-#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
-#else
-#define MMABS_SUM_8x8_SSE2\
-    "movdqa %%xmm7, (%1) \n\t"\
-    MMABS(%%xmm0, %%xmm7)\
-    MMABS(%%xmm1, %%xmm7)\
-    MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
-    MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
-    MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
-    MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
-    MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
-    "movdqa (%1), %%xmm2 \n\t"\
-    MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
-    "paddusw %%xmm1, %%xmm0 \n\t"
-#endif
-
 /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
  * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
  * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
@@ -1002,133 +925,16 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
     "paddusw "#t", "#a" \n\t"\
     "movd "#a", "#dst" \n\t"\
 
-#define HADAMARD8_DIFF_MMX(cpu) \
-static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
-    DECLARE_ALIGNED(8, uint64_t, temp)[13];\
-    int sum;\
-\
-    assert(h==8);\
-\
-    DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
-\
-    __asm__ volatile(\
-        HADAMARD48\
-\
-        "movq %%mm7, 96(%1) \n\t"\
-\
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
-        STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
-\
-        "movq 96(%1), %%mm7 \n\t"\
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
-        STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
-\
-        : "=r" (sum)\
-        : "r"(temp)\
-    );\
-\
-    DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
-\
-    __asm__ volatile(\
-        HADAMARD48\
-\
-        "movq %%mm7, 96(%1) \n\t"\
-\
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
-        STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
-\
-        "movq 96(%1), %%mm7 \n\t"\
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
-        "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
-        "movq %%mm6, %%mm7 \n\t"\
-        "movq %%mm0, %%mm6 \n\t"\
-\
-        LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
-\
-        HADAMARD48\
-        "movq %%mm7, 64(%1) \n\t"\
-        MMABS(%%mm0, %%mm7)\
-        MMABS(%%mm1, %%mm7)\
-        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
-        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
-        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
-        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
-        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
-        "movq 64(%1), %%mm2 \n\t"\
-        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
-        "paddusw %%mm1, %%mm0 \n\t"\
-        "movq %%mm0, 64(%1) \n\t"\
-\
-        LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
-        LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
-\
-        HADAMARD48\
-        "movq %%mm7, (%1) \n\t"\
-        MMABS(%%mm0, %%mm7)\
-        MMABS(%%mm1, %%mm7)\
-        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
-        MMABS_SUM(%%mm3, %%mm7, %%mm1)\
-        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
-        MMABS_SUM(%%mm5, %%mm7, %%mm1)\
-        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
-        "movq (%1), %%mm2 \n\t"\
-        MMABS_SUM(%%mm2, %%mm7, %%mm1)\
-        "paddusw 64(%1), %%mm0 \n\t"\
-        "paddusw %%mm1, %%mm0 \n\t"\
-\
-        HSUM(%%mm0, %%mm1, %0)\
-\
-        : "=r" (sum)\
-        : "r"(temp)\
-    );\
-    return sum&0xFFFF;\
-}\
-WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
-
-#define HADAMARD8_DIFF_SSE2(cpu) \
-static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
-    DECLARE_ALIGNED(16, uint64_t, temp)[4];\
-    int sum;\
-\
-    assert(h==8);\
-\
-    DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
-\
-    __asm__ volatile(\
-        HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
-        TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
-        HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
-        MMABS_SUM_8x8\
-        HSUM_SSE2(%%xmm0, %%xmm1, %0)\
-        : "=r" (sum)\
-        : "r"(temp)\
-    );\
-    return sum&0xFFFF;\
-}\
-WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
-
-#define MMABS(a,z)    MMABS_MMX(a,z)
-#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
-HADAMARD8_DIFF_MMX(mmx)
-#undef MMABS
-#undef HSUM
-
-#define MMABS(a,z)    MMABS_MMX2(a,z)
-#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
-#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
-HADAMARD8_DIFF_MMX(mmx2)
-HADAMARD8_DIFF_SSE2(sse2)
-#undef MMABS
-#undef MMABS_SUM_8x8
-#undef HSUM
-
-#if HAVE_SSSE3
-#define MMABS(a,z)    MMABS_SSSE3(a,z)
-#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
-HADAMARD8_DIFF_SSE2(ssse3)
-#undef MMABS
-#undef MMABS_SUM_8x8
-#endif
-
+#define hadamard_func(cpu) \
+int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
+                             int stride, int h); \
+int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
+                              int stride, int h);
+
+hadamard_func(mmx)
+hadamard_func(mmx2)
+hadamard_func(sse2)
+hadamard_func(ssse3)
+
 #define DCT_SAD4(m,mm,o)\
     "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
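
For reference, the new hadamard_func() macro above only declares the
yasm-implemented entry points; e.g. hadamard_func(mmx) expands to:

    int ff_hadamard8_diff_mmx  (void *s, uint8_t *src1, uint8_t *src2,
                                int stride, int h);
    int ff_hadamard8_diff16_mmx(void *s, uint8_t *src1, uint8_t *src2,
                                int stride, int h);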
@@ -1312,8 +1118,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
         c->diff_bytes= diff_bytes_mmx;
         c->sum_abs_dctelem= sum_abs_dctelem_mmx;
 
-        c->hadamard8_diff[0]= hadamard8_diff16_mmx;
-        c->hadamard8_diff[1]= hadamard8_diff_mmx;
+        c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
+        c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
 
         c->pix_norm1 = pix_norm1_mmx;
         c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
@@ -1336,8 +1142,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
 
         if (mm_flags & AV_CPU_FLAG_MMX2) {
             c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
-            c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
-            c->hadamard8_diff[1]= hadamard8_diff_mmx2;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
             c->vsad[4]= vsad_intra16_mmx2;
 
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -1350,8 +1156,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
         if(mm_flags & AV_CPU_FLAG_SSE2){
             c->get_pixels = get_pixels_sse2;
             c->sum_abs_dctelem= sum_abs_dctelem_sse2;
-            c->hadamard8_diff[0]= hadamard8_diff16_sse2;
-            c->hadamard8_diff[1]= hadamard8_diff_sse2;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
         }
 
         if (CONFIG_LPC && mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
@@ -1365,8 +1171,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
             }
             c->add_8x8basis= add_8x8basis_ssse3;
             c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
-            c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
-            c->hadamard8_diff[1]= hadamard8_diff_ssse3;
+            c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
+            c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
         }
 #endif
 
@@ -26,6 +26,261 @@
 
 SECTION .text
 
+%macro DIFF_PIXELS_1 4
+    movh            %1, %3
+    movh            %2, %4
+    punpcklbw       %2, %1
+    punpcklbw       %1, %1
+    psubw           %1, %2
+%endmacro
+
+; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
+; %6=temporary storage location
+; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
+%macro DIFF_PIXELS_8 6
+    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
+    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
+    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
+    add             %1, %5
+    add             %2, %5
+    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
+    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
+    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
+    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
+%ifdef m8
+    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
+%else
+    mova          [%6], m0
+    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
+    mova            m0, [%6]
+%endif
+    sub             %1, %5
+    sub             %2, %5
+%endmacro
+
+%macro HADAMARD8 0
+    SUMSUB_BADC m0, m1, m2, m3
+    SUMSUB_BADC m4, m5, m6, m7
+    SUMSUB_BADC m0, m2, m1, m3
+    SUMSUB_BADC m4, m6, m5, m7
+    SUMSUB_BADC m0, m4, m1, m5
+    SUMSUB_BADC m2, m6, m3, m7
+%endmacro
+
+%macro ABS1_SUM 3
+    ABS1            %1, %2
+    paddusw         %3, %1
+%endmacro
+
+%macro ABS2_SUM 6
+    ABS2            %1, %2, %3, %4
+    paddusw         %5, %1
+    paddusw         %6, %2
+%endmacro
+
+%macro ABS_SUM_8x8_64 1
+    ABS2            m0, m1, m8, m9
+    ABS2_SUM        m2, m3, m8, m9, m0, m1
+    ABS2_SUM        m4, m5, m8, m9, m0, m1
+    ABS2_SUM        m6, m7, m8, m9, m0, m1
+    paddusw         m0, m1
+%endmacro
+
+%macro ABS_SUM_8x8_32 1
+    mova          [%1], m7
+    ABS1            m0, m7
+    ABS1            m1, m7
+    ABS1_SUM        m2, m7, m0
+    ABS1_SUM        m3, m7, m1
+    ABS1_SUM        m4, m7, m0
+    ABS1_SUM        m5, m7, m1
+    ABS1_SUM        m6, m7, m0
+    mova            m2, [%1]
+    ABS1_SUM        m2, m7, m1
+    paddusw         m0, m1
+%endmacro
+
+; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
+; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
+; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
+%macro HSUM_MMX 3
+    mova            %2, %1
+    psrlq           %1, 32
+    paddusw         %1, %2
+    mova            %2, %1
+    psrlq           %1, 16
+    paddusw         %1, %2
+    movd            %3, %1
+%endmacro
+
+%macro HSUM_MMX2 3
+    pshufw          %2, %1, 0xE
+    paddusw         %1, %2
+    pshufw          %2, %1, 0x1
+    paddusw         %1, %2
+    movd            %3, %1
+%endmacro
+
+%macro HSUM_SSE2 3
+    movhlps         %2, %1
+    paddusw         %1, %2
+    pshuflw         %2, %1, 0xE
+    paddusw         %1, %2
+    pshuflw         %2, %1, 0x1
+    paddusw         %1, %2
+    movd            %3, %1
+%endmacro
+
+%macro STORE4 5
+    mova [%1+mmsize*0], %2
+    mova [%1+mmsize*1], %3
+    mova [%1+mmsize*2], %4
+    mova [%1+mmsize*3], %5
+%endmacro
+
+%macro LOAD4 5
+    mova %2, [%1+mmsize*0]
+    mova %3, [%1+mmsize*1]
+    mova %4, [%1+mmsize*2]
+    mova %5, [%1+mmsize*3]
+%endmacro
+
+%macro hadamard8_16_wrapper 3
+cglobal hadamard8_diff_%1, 4, 4, %2
+%ifndef m8
+    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
+    SUB         rsp, pad
+%endif
+    call hadamard8x8_diff_%1
+%ifndef m8
+    ADD         rsp, pad
+%endif
+    RET
+
+cglobal hadamard8_diff16_%1, 5, 6, %2
+%ifndef m8
+    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
+    SUB         rsp, pad
+%endif
+
+    call hadamard8x8_diff_%1
+    mov         r5d, eax
+
+    add         r1, 8
+    add         r2, 8
+    call hadamard8x8_diff_%1
+    add         r5d, eax
+
+    cmp         r4d, 16
+    jne .done
+
+    lea         r1, [r1+r3*8-8]
+    lea         r2, [r2+r3*8-8]
+    call hadamard8x8_diff_%1
+    add         r5d, eax
+
+    add         r1, 8
+    add         r2, 8
+    call hadamard8x8_diff_%1
+    add         r5d, eax
+
+.done
+    mov         eax, r5d
+%ifndef m8
+    ADD         rsp, pad
+%endif
+    RET
+%endmacro
+
+%macro HADAMARD8_DIFF_MMX 1
+ALIGN 16
+; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
+;                          int stride, int h)
+; r0 = void *s = unused, int h = unused (always 8)
+; note how r1, r2 and r3 are not clobbered in this function, so 16x16
+; can simply call this 2x2x (and that's why we access rsp+gprsize
+; everywhere, which is rsp of calling func
+hadamard8x8_diff_%1:
+    lea                          r0, [r3*3]
+
+    ; first 4x8 pixels
+    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
+    HADAMARD8
+    mova         [rsp+gprsize+0x60], m7
+    TRANSPOSE4x4W                 0,  1,  2,  3,  7
+    STORE4              rsp+gprsize, m0, m1, m2, m3
+    mova                         m7, [rsp+gprsize+0x60]
+    TRANSPOSE4x4W                 4,  5,  6,  7,  0
+    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7
+
+    ; second 4x8 pixels
+    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
+    HADAMARD8
+    mova         [rsp+gprsize+0x60], m7
+    TRANSPOSE4x4W                 0,  1,  2,  3,  7
+    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
+    mova                         m7, [rsp+gprsize+0x60]
+    TRANSPOSE4x4W                 4,  5,  6,  7,  0
+
+    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
+    HADAMARD8
+    ABS_SUM_8x8_32 rsp+gprsize+0x60
+    mova         [rsp+gprsize+0x60], m0
+
+    LOAD4          rsp+gprsize     , m0, m1, m2, m3
+    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
+    HADAMARD8
+    ABS_SUM_8x8_32 rsp+gprsize
+    paddusw                      m0, [rsp+gprsize+0x60]
+
+    HSUM                         m0, m1, eax
+    and                         rax, 0xFFFF
+    ret
+
+hadamard8_16_wrapper %1, 0, 14
+%endmacro
+
+%macro HADAMARD8_DIFF_SSE2 2
+hadamard8x8_diff_%1:
+    lea                          r0, [r3*3]
+    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize
+    HADAMARD8
+%ifdef ARCH_X86_64
+    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7,  8
+%else
+    TRANSPOSE8x8W                 0,  1,  2,  3,  4,  5,  6,  7, [rsp+gprsize], [rsp+mmsize+gprsize]
+%endif
+    HADAMARD8
+    ABS_SUM_8x8         rsp+gprsize
+    HSUM_SSE2                    m0, m1, eax
+    and                         eax, 0xFFFF
+    ret
+
+hadamard8_16_wrapper %1, %2, 3
+%endmacro
+
+INIT_MMX
+%define ABS1 ABS1_MMX
+%define HSUM HSUM_MMX
+HADAMARD8_DIFF_MMX mmx
+
+%define ABS1 ABS1_MMX2
+%define HSUM HSUM_MMX2
+HADAMARD8_DIFF_MMX mmx2
+
+INIT_XMM
+%define ABS2 ABS2_MMX2
+%ifdef ARCH_X86_64
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
+%else
+%define ABS_SUM_8x8 ABS_SUM_8x8_32
+%endif
+HADAMARD8_DIFF_SSE2 sse2, 10
+
+%define ABS2        ABS2_SSSE3
+%define ABS_SUM_8x8 ABS_SUM_8x8_64
+HADAMARD8_DIFF_SSE2 ssse3, 9
+
 INIT_XMM
 ; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 cglobal sse16_sse2, 5, 5, 8
@@ -148,12 +148,30 @@
 %endmacro
 
 %macro ABS1_MMX 2 ; a, tmp
+    pxor       %2, %2
+    pcmpgtw    %2, %1
+    pxor       %1, %2
+    psubw      %1, %2
+%endmacro
+
+%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
+    pxor       %3, %3
+    pxor       %4, %4
+    pcmpgtw    %3, %1
+    pcmpgtw    %4, %2
+    pxor       %1, %3
+    pxor       %2, %4
+    psubw      %1, %3
+    psubw      %2, %4
+%endmacro
+
+%macro ABS1_MMX2 2 ; a, tmp
     pxor       %2, %2
     psubw      %2, %1
     pmaxsw     %1, %2
 %endmacro
 
-%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
+%macro ABS2_MMX2 4 ; a, b, tmp0, tmp1
     pxor       %3, %3
    pxor       %4, %4
    psubw      %3, %1
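
The new ABS1_MMX/ABS2_MMX bodies use the sign-mask absolute-value idiom,
since plain MMX has no pmaxsw or pabsw. A scalar C equivalent of one lane
(illustrative only, not part of the commit):

    /* pxor z,z ; pcmpgtw z,a ; pxor a,z ; psubw a,z  ->  |a| per 16-bit lane */
    static inline int16_t abs16_sign_mask(int16_t a)
    {
        int16_t z = a < 0 ? -1 : 0;  /* all-ones mask when a is negative        */
        a ^= z;                      /* one's complement if negative            */
        a -= z;                      /* +1 if negative -> |a|                   */
        return a;                    /* -32768 stays -32768, as in the SIMD code */
    }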