Move hadamard8_diff{,16}_{mmx,mmx2,sse2,ssse3}() from inline asm to yasm,
which will hopefully solve the Win64/FATE failures caused by these functions.

Originally committed as revision 25137 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
d0acc2d2e9
commit
e2e341048e
@ -879,55 +879,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
|
||||
*left = src2[w-1];
|
||||
}
|
||||
|
||||
#define DIFF_PIXELS_1(m,a,t,p1,p2)\
|
||||
"mov"#m" "#p1", "#a" \n\t"\
|
||||
"mov"#m" "#p2", "#t" \n\t"\
|
||||
"punpcklbw "#a", "#t" \n\t"\
|
||||
"punpcklbw "#a", "#a" \n\t"\
|
||||
"psubw "#t", "#a" \n\t"\
|
||||
|
||||
#define DIFF_PIXELS_8(m0,m1,mm,p1,p2,stride,temp) {\
|
||||
uint8_t *p1b=p1, *p2b=p2;\
|
||||
__asm__ volatile(\
|
||||
DIFF_PIXELS_1(m0, mm##0, mm##7, (%1), (%2))\
|
||||
DIFF_PIXELS_1(m0, mm##1, mm##7, (%1,%3), (%2,%3))\
|
||||
DIFF_PIXELS_1(m0, mm##2, mm##7, (%1,%3,2), (%2,%3,2))\
|
||||
"add %4, %1 \n\t"\
|
||||
"add %4, %2 \n\t"\
|
||||
DIFF_PIXELS_1(m0, mm##3, mm##7, (%1), (%2))\
|
||||
DIFF_PIXELS_1(m0, mm##4, mm##7, (%1,%3), (%2,%3))\
|
||||
DIFF_PIXELS_1(m0, mm##5, mm##7, (%1,%3,2), (%2,%3,2))\
|
||||
DIFF_PIXELS_1(m0, mm##6, mm##7, (%1,%4), (%2,%4))\
|
||||
"mov"#m1" "#mm"0, %0 \n\t"\
|
||||
DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
|
||||
"mov"#m1" %0, "#mm"0 \n\t"\
|
||||
: "+m"(temp), "+r"(p1b), "+r"(p2b)\
|
||||
: "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
|
||||
);\
|
||||
}
|
||||
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
|
||||
|
||||
#define DIFF_PIXELS_4x8(p1,p2,stride,temp) DIFF_PIXELS_8(d, q, %%mm, p1, p2, stride, temp)
|
||||
#define DIFF_PIXELS_8x8(p1,p2,stride,temp) DIFF_PIXELS_8(q, dqa, %%xmm, p1, p2, stride, temp)
|
||||
|
||||
#define LBUTTERFLY2(a1,b1,a2,b2)\
|
||||
"paddw " #b1 ", " #a1 " \n\t"\
|
||||
"paddw " #b2 ", " #a2 " \n\t"\
|
||||
"paddw " #b1 ", " #b1 " \n\t"\
|
||||
"paddw " #b2 ", " #b2 " \n\t"\
|
||||
"psubw " #a1 ", " #b1 " \n\t"\
|
||||
"psubw " #a2 ", " #b2 " \n\t"
|
||||
|
||||
#define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\
|
||||
LBUTTERFLY2(m0, m1, m2, m3)\
|
||||
LBUTTERFLY2(m4, m5, m6, m7)\
|
||||
LBUTTERFLY2(m0, m2, m1, m3)\
|
||||
LBUTTERFLY2(m4, m6, m5, m7)\
|
||||
LBUTTERFLY2(m0, m4, m1, m5)\
|
||||
LBUTTERFLY2(m2, m6, m3, m7)\
|
||||
|
||||
#define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
|
||||
|
||||
#define MMABS_MMX(a,z)\
|
||||
"pxor " #z ", " #z " \n\t"\
|
||||
"pcmpgtw " #a ", " #z " \n\t"\
|
||||
@ -946,34 +897,6 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
|
||||
MMABS(a,z)\
|
||||
"paddusw " #a ", " #sum " \n\t"
|
||||
|
||||
#define MMABS_SUM_8x8_NOSPILL\
|
||||
MMABS(%%xmm0, %%xmm8)\
|
||||
MMABS(%%xmm1, %%xmm9)\
|
||||
MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\
|
||||
MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\
|
||||
MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\
|
||||
MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\
|
||||
MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\
|
||||
MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\
|
||||
"paddusw %%xmm1, %%xmm0 \n\t"
|
||||
|
||||
#if ARCH_X86_64
|
||||
#define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL
|
||||
#else
|
||||
#define MMABS_SUM_8x8_SSE2\
|
||||
"movdqa %%xmm7, (%1) \n\t"\
|
||||
MMABS(%%xmm0, %%xmm7)\
|
||||
MMABS(%%xmm1, %%xmm7)\
|
||||
MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\
|
||||
MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\
|
||||
MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\
|
||||
MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\
|
||||
MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\
|
||||
"movdqa (%1), %%xmm2 \n\t"\
|
||||
MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\
|
||||
"paddusw %%xmm1, %%xmm0 \n\t"
|
||||
#endif
|
||||
|
||||
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
|
||||
* about 100k on extreme inputs. But that's very unlikely to occur in natural video,
|
||||
* and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
|
||||
@ -1002,133 +925,16 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, c
|
||||
"paddusw "#t", "#a" \n\t"\
|
||||
"movd "#a", "#dst" \n\t"\
|
||||
|
||||
#define HADAMARD8_DIFF_MMX(cpu) \
|
||||
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
|
||||
DECLARE_ALIGNED(8, uint64_t, temp)[13];\
|
||||
int sum;\
|
||||
\
|
||||
assert(h==8);\
|
||||
\
|
||||
DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\
|
||||
\
|
||||
__asm__ volatile(\
|
||||
HADAMARD48\
|
||||
\
|
||||
"movq %%mm7, 96(%1) \n\t"\
|
||||
\
|
||||
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
|
||||
STORE4(8, 0(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
|
||||
\
|
||||
"movq 96(%1), %%mm7 \n\t"\
|
||||
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
|
||||
STORE4(8, 64(%1), %%mm4, %%mm7, %%mm0, %%mm6)\
|
||||
\
|
||||
: "=r" (sum)\
|
||||
: "r"(temp)\
|
||||
);\
|
||||
\
|
||||
DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\
|
||||
\
|
||||
__asm__ volatile(\
|
||||
HADAMARD48\
|
||||
\
|
||||
"movq %%mm7, 96(%1) \n\t"\
|
||||
\
|
||||
TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
|
||||
STORE4(8, 32(%1), %%mm0, %%mm3, %%mm7, %%mm2)\
|
||||
\
|
||||
"movq 96(%1), %%mm7 \n\t"\
|
||||
TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
|
||||
"movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
|
||||
"movq %%mm6, %%mm7 \n\t"\
|
||||
"movq %%mm0, %%mm6 \n\t"\
|
||||
\
|
||||
LOAD4(8, 64(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
|
||||
\
|
||||
HADAMARD48\
|
||||
"movq %%mm7, 64(%1) \n\t"\
|
||||
MMABS(%%mm0, %%mm7)\
|
||||
MMABS(%%mm1, %%mm7)\
|
||||
MMABS_SUM(%%mm2, %%mm7, %%mm0)\
|
||||
MMABS_SUM(%%mm3, %%mm7, %%mm1)\
|
||||
MMABS_SUM(%%mm4, %%mm7, %%mm0)\
|
||||
MMABS_SUM(%%mm5, %%mm7, %%mm1)\
|
||||
MMABS_SUM(%%mm6, %%mm7, %%mm0)\
|
||||
"movq 64(%1), %%mm2 \n\t"\
|
||||
MMABS_SUM(%%mm2, %%mm7, %%mm1)\
|
||||
"paddusw %%mm1, %%mm0 \n\t"\
|
||||
"movq %%mm0, 64(%1) \n\t"\
|
||||
\
|
||||
LOAD4(8, 0(%1), %%mm0, %%mm1, %%mm2, %%mm3)\
|
||||
LOAD4(8, 32(%1), %%mm4, %%mm5, %%mm6, %%mm7)\
|
||||
\
|
||||
HADAMARD48\
|
||||
"movq %%mm7, (%1) \n\t"\
|
||||
MMABS(%%mm0, %%mm7)\
|
||||
MMABS(%%mm1, %%mm7)\
|
||||
MMABS_SUM(%%mm2, %%mm7, %%mm0)\
|
||||
MMABS_SUM(%%mm3, %%mm7, %%mm1)\
|
||||
MMABS_SUM(%%mm4, %%mm7, %%mm0)\
|
||||
MMABS_SUM(%%mm5, %%mm7, %%mm1)\
|
||||
MMABS_SUM(%%mm6, %%mm7, %%mm0)\
|
||||
"movq (%1), %%mm2 \n\t"\
|
||||
MMABS_SUM(%%mm2, %%mm7, %%mm1)\
|
||||
"paddusw 64(%1), %%mm0 \n\t"\
|
||||
"paddusw %%mm1, %%mm0 \n\t"\
|
||||
\
|
||||
HSUM(%%mm0, %%mm1, %0)\
|
||||
\
|
||||
: "=r" (sum)\
|
||||
: "r"(temp)\
|
||||
);\
|
||||
return sum&0xFFFF;\
|
||||
}\
|
||||
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
|
||||
/* Declares the prototypes of the two externally assembled Hadamard
 * difference functions for one CPU flavour: ff_hadamard8_diff_<cpu> and
 * ff_hadamard8_diff16_<cpu>. Both take
 * (void *s, uint8_t *src1, uint8_t *src2, int stride, int h) and return int. */
#define hadamard_func(cpu) \
|
||||
int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
|
||||
int stride, int h); \
|
||||
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
|
||||
int stride, int h);
|
||||
|
||||
#define HADAMARD8_DIFF_SSE2(cpu) \
|
||||
static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
|
||||
DECLARE_ALIGNED(16, uint64_t, temp)[4];\
|
||||
int sum;\
|
||||
\
|
||||
assert(h==8);\
|
||||
\
|
||||
DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\
|
||||
\
|
||||
__asm__ volatile(\
|
||||
HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\
|
||||
TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\
|
||||
HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\
|
||||
MMABS_SUM_8x8\
|
||||
HSUM_SSE2(%%xmm0, %%xmm1, %0)\
|
||||
: "=r" (sum)\
|
||||
: "r"(temp)\
|
||||
);\
|
||||
return sum&0xFFFF;\
|
||||
}\
|
||||
WRAPPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu)
|
||||
|
||||
#define MMABS(a,z) MMABS_MMX(a,z)
|
||||
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
|
||||
HADAMARD8_DIFF_MMX(mmx)
|
||||
#undef MMABS
|
||||
#undef HSUM
|
||||
|
||||
#define MMABS(a,z) MMABS_MMX2(a,z)
|
||||
#define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2
|
||||
#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
|
||||
HADAMARD8_DIFF_MMX(mmx2)
|
||||
HADAMARD8_DIFF_SSE2(sse2)
|
||||
#undef MMABS
|
||||
#undef MMABS_SUM_8x8
|
||||
#undef HSUM
|
||||
|
||||
#if HAVE_SSSE3
|
||||
#define MMABS(a,z) MMABS_SSSE3(a,z)
|
||||
#define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL
|
||||
HADAMARD8_DIFF_SSE2(ssse3)
|
||||
#undef MMABS
|
||||
#undef MMABS_SUM_8x8
|
||||
#endif
|
||||
/* Prototypes for every CPU flavour that dsputilenc_init_mmx() installs
 * into c->hadamard8_diff[]. */
hadamard_func(mmx)
|
||||
hadamard_func(mmx2)
|
||||
hadamard_func(sse2)
|
||||
hadamard_func(ssse3)
|
||||
|
||||
#define DCT_SAD4(m,mm,o)\
|
||||
"mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
|
||||
@ -1312,8 +1118,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
c->diff_bytes= diff_bytes_mmx;
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
|
||||
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_mmx;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_mmx;
|
||||
c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx;
|
||||
c->hadamard8_diff[1]= ff_hadamard8_diff_mmx;
|
||||
|
||||
c->pix_norm1 = pix_norm1_mmx;
|
||||
c->sse[0] = (mm_flags & AV_CPU_FLAG_SSE2) ? ff_sse16_sse2 : sse16_mmx;
|
||||
@ -1336,8 +1142,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
|
||||
if (mm_flags & AV_CPU_FLAG_MMX2) {
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_mmx2;
|
||||
c->hadamard8_diff[0]= ff_hadamard8_diff16_mmx2;
|
||||
c->hadamard8_diff[1]= ff_hadamard8_diff_mmx2;
|
||||
c->vsad[4]= vsad_intra16_mmx2;
|
||||
|
||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
||||
@ -1350,8 +1156,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
if(mm_flags & AV_CPU_FLAG_SSE2){
|
||||
c->get_pixels = get_pixels_sse2;
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_sse2;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_sse2;
|
||||
c->hadamard8_diff[0]= ff_hadamard8_diff16_sse2;
|
||||
c->hadamard8_diff[1]= ff_hadamard8_diff_sse2;
|
||||
}
|
||||
|
||||
if (CONFIG_LPC && mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
|
||||
@ -1365,8 +1171,8 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
}
|
||||
c->add_8x8basis= add_8x8basis_ssse3;
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
|
||||
c->hadamard8_diff[0]= ff_hadamard8_diff16_ssse3;
|
||||
c->hadamard8_diff[1]= ff_hadamard8_diff_ssse3;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -26,6 +26,261 @@
|
||||
|
||||
SECTION .text
|
||||
|
||||
; DIFF_PIXELS_1 dst, tmp, pix1_mem, pix2_mem
; Leaves the 16-bit differences pix1[i] - pix2[i] in %1; %2 is clobbered.
; No zeroed register is needed: after the two unpacks every word of %2 is
; pix2 | pix1<<8 and every word of %1 is pix1 | pix1<<8, so the high bytes
; cancel in the final psubw.
; NOTE(review): movh is defined outside this file; presumably it loads half
; a register's worth of bytes -- confirm against x86inc.asm/x86util.asm.
%macro DIFF_PIXELS_1 4
|
||||
movh %1, %3
|
||||
movh %2, %4
|
||||
punpcklbw %2, %1
|
||||
punpcklbw %1, %1
|
||||
psubw %1, %2
|
||||
%endmacro
|
||||
|
||||
; DIFF_PIXELS_8: fills m0..m7 with the 16-bit pixel differences of eight
; consecutive rows, row n being [%1 + n*%4 + %3] minus [%2 + n*%4 + %3].
; %1 and %2 are advanced by %5 for rows 3..7 and restored before the macro
; ends, so the caller's pointers are unchanged afterwards.
; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
|
||||
; %6=temporary storage location
|
||||
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
|
||||
%macro DIFF_PIXELS_8 6
|
||||
DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
|
||||
DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
|
||||
DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
|
||||
; step both pointers down three rows; the remaining offsets are relative to row 3
add %1, %5
|
||||
add %2, %5
|
||||
DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
|
||||
DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
|
||||
DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
|
||||
DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
|
||||
; row 7 lands in m7 itself, so another scratch register is needed:
; use m8 when it exists, otherwise park m0 in [%6] and borrow it
%ifdef m8
|
||||
DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
|
||||
%else
|
||||
mova [%6], m0
|
||||
DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
|
||||
mova m0, [%6]
|
||||
%endif
|
||||
; undo the pointer advance made after row 2
sub %1, %5
|
||||
sub %2, %5
|
||||
%endmacro
|
||||
|
||||
; HADAMARD8: three rounds of SUMSUB_BADC over m0..m7, pairing registers at
; distance 1, then 2, then 4. Operates in place on m0..m7.
; NOTE(review): SUMSUB_BADC is defined outside this file; presumably it is a
; sum/difference butterfly on each register pair, which would make this an
; 8-point Hadamard transform across the registers -- confirm in x86util.asm.
%macro HADAMARD8 0
|
||||
SUMSUB_BADC m0, m1, m2, m3
|
||||
SUMSUB_BADC m4, m5, m6, m7
|
||||
SUMSUB_BADC m0, m2, m1, m3
|
||||
SUMSUB_BADC m4, m6, m5, m7
|
||||
SUMSUB_BADC m0, m4, m1, m5
|
||||
SUMSUB_BADC m2, m6, m3, m7
|
||||
%endmacro
|
||||
|
||||
; ABS1_SUM a, tmp, acc
; Applies the currently selected ABS1 to %1 (with %2 as its temporary), then
; adds %1 to the accumulator %3 word-wise with unsigned saturation.
%macro ABS1_SUM 3
|
||||
ABS1 %1, %2
|
||||
paddusw %3, %1
|
||||
%endmacro
|
||||
|
||||
; ABS2_SUM a, b, tmp0, tmp1, acc0, acc1
; Two-register form of ABS1_SUM: applies the currently selected ABS2 to %1 and
; %2 (temporaries %3, %4), then adds %1 to %5 and %2 to %6 word-wise with
; unsigned saturation.
%macro ABS2_SUM 6
|
||||
ABS2 %1, %2, %3, %4
|
||||
paddusw %5, %1
|
||||
paddusw %6, %2
|
||||
%endmacro
|
||||
|
||||
; ABS_SUM_8x8_64 scratch
; Accumulates the absolute values of m0..m7 word-wise into m0 (unsigned
; saturating adds), using two running sums in m0 and m1 that are merged at the
; end. m8 and m9 are handed to ABS2 as temporaries; m1..m7 are clobbered.
; %1 is not referenced in the body; it is only accepted so the macro can be
; invoked with the same argument as ABS_SUM_8x8_32.
%macro ABS_SUM_8x8_64 1
|
||||
ABS2 m0, m1, m8, m9
|
||||
ABS2_SUM m2, m3, m8, m9, m0, m1
|
||||
ABS2_SUM m4, m5, m8, m9, m0, m1
|
||||
ABS2_SUM m6, m7, m8, m9, m0, m1
|
||||
paddusw m0, m1
|
||||
%endmacro
|
||||
|
||||
; ABS_SUM_8x8_32 scratch
; Accumulates the absolute values of m0..m7 word-wise into m0 (unsigned
; saturating adds) using only registers m0..m7. m7 is first spilled to [%1] so
; it can serve as ABS1's temporary; its saved value is reloaded into m2 (already
; consumed by then) and summed last. Requires mmsize writable bytes at [%1].
%macro ABS_SUM_8x8_32 1
|
||||
mova [%1], m7
|
||||
ABS1 m0, m7
|
||||
ABS1 m1, m7
|
||||
ABS1_SUM m2, m7, m0
|
||||
ABS1_SUM m3, m7, m1
|
||||
ABS1_SUM m4, m7, m0
|
||||
ABS1_SUM m5, m7, m1
|
||||
ABS1_SUM m6, m7, m0
|
||||
mova m2, [%1]
|
||||
ABS1_SUM m2, m7, m1
|
||||
paddusw m0, m1
|
||||
%endmacro
|
||||
|
||||
; FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
|
||||
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
|
||||
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
|
||||
; HSUM_MMX src, tmp, dst
; Horizontal sum of the four words of %1 using shifts and unsigned saturating
; adds; %1 and %2 are clobbered. The total ends up in the low word of %1 and
; the low 32 bits of %1 are moved to %3, so bits 16..31 of %3 hold a partial
; sum -- callers in this file mask the result with 0xFFFF.
%macro HSUM_MMX 3
|
||||
mova %2, %1
|
||||
psrlq %1, 32
|
||||
paddusw %1, %2
|
||||
mova %2, %1
|
||||
psrlq %1, 16
|
||||
paddusw %1, %2
|
||||
movd %3, %1
|
||||
%endmacro
|
||||
|
||||
; HSUM_MMX2 src, tmp, dst
; Horizontal sum of the four words of %1 using pshufw instead of copy+shift:
; shuffle 0xE brings words 2,3 down to positions 0,1; shuffle 0x1 then brings
; word 1 down to position 0. Adds are unsigned saturating; %1 and %2 are
; clobbered. Only the low word of %3 is the total (callers mask with 0xFFFF).
%macro HSUM_MMX2 3
|
||||
pshufw %2, %1, 0xE
|
||||
paddusw %1, %2
|
||||
pshufw %2, %1, 0x1
|
||||
paddusw %1, %2
|
||||
movd %3, %1
|
||||
%endmacro
|
||||
|
||||
; HSUM_SSE2 src, tmp, dst
; Horizontal sum of the eight words of %1: movhlps folds the upper four words
; onto the lower four, then two pshuflw/paddusw steps reduce those four to one.
; Adds are unsigned saturating; %1 and %2 are clobbered. Only the low word of
; %3 is the total (callers mask with 0xFFFF).
%macro HSUM_SSE2 3
|
||||
movhlps %2, %1
|
||||
paddusw %1, %2
|
||||
pshuflw %2, %1, 0xE
|
||||
paddusw %1, %2
|
||||
pshuflw %2, %1, 0x1
|
||||
paddusw %1, %2
|
||||
movd %3, %1
|
||||
%endmacro
|
||||
|
||||
; STORE4 base, r0, r1, r2, r3
; Stores four registers to consecutive mmsize-sized slots starting at [%1].
%macro STORE4 5
|
||||
mova [%1+mmsize*0], %2
|
||||
mova [%1+mmsize*1], %3
|
||||
mova [%1+mmsize*2], %4
|
||||
mova [%1+mmsize*3], %5
|
||||
%endmacro
|
||||
|
||||
; LOAD4 base, r0, r1, r2, r3
; Loads four registers from consecutive mmsize-sized slots starting at [%1]
; (the inverse of STORE4).
%macro LOAD4 5
|
||||
mova %2, [%1+mmsize*0]
|
||||
mova %3, [%1+mmsize*1]
|
||||
mova %4, [%1+mmsize*2]
|
||||
mova %5, [%1+mmsize*3]
|
||||
%endmacro
|
||||
|
||||
; hadamard8_16_wrapper cpu, xmm_regs, stack_slots
; Emits the two entry points built on the internal hadamard8x8_diff_<cpu>
; routine:
;   hadamard8_diff_<cpu>   - a single 8x8 block
;   hadamard8_diff16_<cpu> - 16 pixels wide: all four 8x8 blocks when
;                            h (r4d) == 16, otherwise only the top two
; %2 is forwarded to cglobal as its fourth argument (NOTE(review): presumably
; the number of XMM registers used -- confirm against x86inc.asm).
; %3 is the number of mmsize-sized scratch slots the 8x8 routine needs; they
; are reserved on the stack here, only when no m8 register exists, and the
; callee reaches them at rsp+gprsize (just above its return address).
; NOTE(review): the "4+stack_offset&(mmsize-1)" term in pad presumably keeps
; that scratch area mmsize-aligned -- confirm against x86inc.asm.
%macro hadamard8_16_wrapper 3
cglobal hadamard8_diff_%1, 4, 4, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif
    call hadamard8x8_diff_%1
%ifndef m8
    ADD            rsp, pad
%endif
    RET

cglobal hadamard8_diff16_%1, 5, 6, %2
%ifndef m8
    %assign pad %3*mmsize-(4+stack_offset&(mmsize-1))
    SUB            rsp, pad
%endif

    ; top-left block; r5d accumulates because eax is the callee's return value
    call hadamard8x8_diff_%1
    mov            r5d, eax

    ; top-right block
    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add            r5d, eax

    ; anything but h == 16 stops after the top row of blocks
    cmp            r4d, 16
    jne .done

    ; bottom-left block: down eight rows, back eight columns
    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff_%1
    add            r5d, eax

    ; bottom-right block
    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff_%1
    add            r5d, eax

.done:
    mov            eax, r5d
%ifndef m8
    ADD            rsp, pad
%endif
    RET
%endmacro
|
||||
|
||||
; HADAMARD8_DIFF_MMX cpu
; Emits the internal 8x8 routine hadamard8x8_diff_<cpu> for 64-bit MMX
; registers, followed by its public wrappers (hadamard8_16_wrapper with
; 14 scratch slots of mmsize bytes). ABS1 and HSUM must be %defined to the
; flavour wanted before this macro is invoked.
;
; An MMX register holds only four 16-bit differences, so the 8x8 block is
; processed as two 4-pixel-wide halves whose transposed intermediate results
; are kept in the caller-provided scratch area:
;   rsp+gprsize+0x00  rows 0-3 of the first half   (4 slots)
;   rsp+gprsize+0x20  rows 0-3 of the second half  (4 slots)
;   rsp+gprsize+0x40  rows 4-7 of the first half   (4 slots)
;   rsp+gprsize+0x60  single-register spill slot / partial sum
%macro HADAMARD8_DIFF_MMX 1
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused (overwritten with stride*3), int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2x (and that's why we access rsp+gprsize
; everywhere, which is rsp of calling func)
hadamard8x8_diff_%1:
    lea                          r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7      ; free m7 as transpose temporary
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4              rsp+gprsize, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0
    STORE4         rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8                r1, r2,  4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova         [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W                 0,  1,  2,  3,  7
    STORE4         rsp+gprsize+0x20, m0, m1, m2, m3
    mova                         m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W                 4,  5,  6,  7,  0

    ; second pass over rows 4-7: m4-m7 are still live from the transpose
    ; above, the other four come back from the scratch area
    LOAD4          rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova         [rsp+gprsize+0x60], m0      ; keep this partial sum

    ; second pass over rows 0-3
    LOAD4          rsp+gprsize     , m0, m1, m2, m3
    LOAD4          rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw                      m0, [rsp+gprsize+0x60]

    HSUM                         m0, m1, eax
    ; only the low word is the total; the 32-bit form matches the SSE2
    ; routine and gives the same value, since the movd into eax inside HSUM
    ; has already cleared the upper half of rax
    and                         eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 0, 14
%endmacro
|
||||
|
||||
; HADAMARD8_DIFF_SSE2 cpu, xmm_regs
; Emits the internal 8x8 routine hadamard8x8_diff_<cpu> for XMM registers
; (one register holds a full row of eight 16-bit differences), followed by its
; public wrappers (hadamard8_16_wrapper with %2 forwarded and 3 scratch slots
; of mmsize bytes). ABS_SUM_8x8, and the ABS1/ABS2 it relies on, must be
; %defined before this macro is invoked.
; In: r1 = src1, r2 = src2, r3 = stride. r0 is overwritten with stride*3.
; Out: eax = sum masked to 16 bits. r1/r2 are restored by DIFF_PIXELS_8.
; Scratch memory is addressed at rsp+gprsize, i.e. just above this routine's
; own return address.
%macro HADAMARD8_DIFF_SSE2 2
|
||||
hadamard8x8_diff_%1:
|
||||
lea r0, [r3*3]
|
||||
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
|
||||
HADAMARD8
|
||||
; NOTE(review): TRANSPOSE8x8W is defined elsewhere; the trailing argument(s)
; look like its temporary -- register 8 on x86-64, two stack slots otherwise
%ifdef ARCH_X86_64
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
|
||||
%else
|
||||
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
|
||||
%endif
|
||||
HADAMARD8
|
||||
ABS_SUM_8x8 rsp+gprsize
|
||||
HSUM_SSE2 m0, m1, eax
|
||||
; only the low word of the horizontal sum is the total
and eax, 0xFFFF
|
||||
ret
|
||||
|
||||
hadamard8_16_wrapper %1, %2, 3
|
||||
%endmacro
|
||||
|
||||
; Instantiate the four flavours. Each HADAMARD8_DIFF_* expansion picks up
; whatever ABS1 / ABS2 / HSUM / ABS_SUM_8x8 are %defined to at that point.
INIT_MMX
|
||||
%define ABS1 ABS1_MMX
|
||||
%define HSUM HSUM_MMX
|
||||
HADAMARD8_DIFF_MMX mmx
|
||||
|
||||
%define ABS1 ABS1_MMX2
|
||||
%define HSUM HSUM_MMX2
|
||||
HADAMARD8_DIFF_MMX mmx2
|
||||
|
||||
; ABS1 stays ABS1_MMX2 from here on; the x86-32 SSE2 build uses it through
; ABS_SUM_8x8_32
INIT_XMM
|
||||
%define ABS2 ABS2_MMX2
|
||||
%ifdef ARCH_X86_64
|
||||
%define ABS_SUM_8x8 ABS_SUM_8x8_64
|
||||
%else
|
||||
%define ABS_SUM_8x8 ABS_SUM_8x8_32
|
||||
%endif
|
||||
HADAMARD8_DIFF_SSE2 sse2, 10
|
||||
|
||||
; NOTE(review): the m8/m9-based variant is selected unconditionally here;
; presumably ABS2_SSSE3 never touches its temporaries, which would also explain
; declaring one register fewer than for sse2 -- confirm against x86util.asm
%define ABS2 ABS2_SSSE3
|
||||
%define ABS_SUM_8x8 ABS_SUM_8x8_64
|
||||
HADAMARD8_DIFF_SSE2 ssse3, 9
|
||||
|
||||
INIT_XMM
|
||||
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
|
||||
cglobal sse16_sse2, 5, 5, 8
|
||||
|
@ -148,12 +148,30 @@
|
||||
%endmacro
|
||||
|
||||
; Word-wise absolute value of %1 using only plain MMX instructions.
; %2 becomes a per-word mask (all ones where %1 is negative); xor-ing with the
; mask and then subtracting it negates exactly those words. %2 is clobbered.
%macro ABS1_MMX 2 ; a, tmp
|
||||
pxor %2, %2
|
||||
pcmpgtw %2, %1
|
||||
pxor %1, %2
|
||||
psubw %1, %2
|
||||
%endmacro
|
||||
|
||||
; Word-wise absolute value of two registers (%1 with temporary %3, %2 with
; temporary %4), using the same sign-mask xor/subtract sequence as ABS1_MMX
; with the two computations interleaved. %3 and %4 are clobbered.
%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
|
||||
pxor %3, %3
|
||||
pxor %4, %4
|
||||
pcmpgtw %3, %1
|
||||
pcmpgtw %4, %2
|
||||
pxor %1, %3
|
||||
pxor %2, %4
|
||||
psubw %1, %3
|
||||
psubw %2, %4
|
||||
%endmacro
|
||||
|
||||
; Word-wise absolute value of %1 computed as max(a, 0 - a) with pmaxsw.
; %2 receives the negated value and is clobbered.
%macro ABS1_MMX2 2 ; a, tmp
|
||||
pxor %2, %2
|
||||
psubw %2, %1
|
||||
pmaxsw %1, %2
|
||||
%endmacro
|
||||
|
||||
%macro ABS2_MMX 4 ; a, b, tmp0, tmp1
|
||||
%macro ABS2_MMX2 4 ; a, b, tmp0, tmp1
|
||||
pxor %3, %3
|
||||
pxor %4, %4
|
||||
psubw %3, %1
|
||||
|
Loading…
x
Reference in New Issue
Block a user