/* * MMX optimized DSP utils * Copyright (c) 2000, 2001 Fabrice Bellard. * Copyright (c) 2002-2004 Michael Niedermayer * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA * * MMX optimization by Nick Kurshev */ #include "dsputil.h" #include "simple_idct.h" #include "mpegvideo.h" #include "x86_cpu.h" #include "mmx.h" #include "vp3dsp_mmx.h" #include "vp3dsp_sse2.h" #include "h263.h" //#undef NDEBUG //#include extern void ff_idct_xvid_mmx(short *block); extern void ff_idct_xvid_mmx2(short *block); int mm_flags; /* multimedia extension flags */ /* pixel operations */ static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL; static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL; static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL; static const uint64_t ff_pdw_80000000[2] attribute_used __attribute__ ((aligned(16))) = {0x8000000080000000ULL, 0x8000000080000000ULL}; static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL; static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL; static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL; static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL; static const uint64_t ff_pw_8 attribute_used __attribute__ ((aligned(8))) = 0x0008000800080008ULL; static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL; static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL; static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL; static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL; static const uint64_t ff_pb_1 attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL; static const uint64_t ff_pb_3 attribute_used __attribute__ ((aligned(8))) = 0x0303030303030303ULL; static const uint64_t ff_pb_7 attribute_used __attribute__ ((aligned(8))) = 0x0707070707070707ULL; static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL; static const uint64_t ff_pb_A1 attribute_used __attribute__ ((aligned(8))) = 0xA1A1A1A1A1A1A1A1ULL; static const uint64_t ff_pb_5F attribute_used __attribute__ ((aligned(8))) = 0x5F5F5F5F5F5F5F5FULL; static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL; static const double ff_pd_1[2] attribute_used __attribute__ ((aligned(16))) = { 1.0, 1.0 }; static const double ff_pd_2[2] attribute_used __attribute__ ((aligned(16))) = { 2.0, 2.0 }; #define JUMPALIGN() __asm __volatile (ASMALIGN(3)::) #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::) #define MOVQ_WONE(regd) \ __asm __volatile ( \ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ "psrlw $15, %%" #regd ::) #define MOVQ_BFE(regd) \ __asm __volatile ( \ "pcmpeqd %%" #regd ", %%" #regd " \n\t"\ "paddb %%" #regd ", %%" #regd " \n\t" ::) #ifndef PIC #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone)) #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo)) #else // for shared library it's better to use this way for accessing constants // pcmpeqd -> -1 #define MOVQ_BONE(regd) \ __asm __volatile ( \ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ "psrlw $15, %%" #regd " \n\t" \ "packuswb %%" #regd ", %%" #regd " \n\t" ::) #define MOVQ_WTWO(regd) \ __asm __volatile ( \ "pcmpeqd %%" #regd ", %%" #regd " \n\t" \ "psrlw $15, %%" #regd " \n\t" \ "psllw $1, %%" #regd " \n\t"::) #endif // using regr as temporary and for the output result // first argument is unmodifed and second is trashed // regfe is supposed to contain 0xfefefefefefefefe #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ "movq " #rega ", " #regr " \n\t"\ "pand " #regb ", " #regr " \n\t"\ "pxor " #rega ", " #regb " \n\t"\ "pand " #regfe "," #regb " \n\t"\ "psrlq $1, " #regb " \n\t"\ "paddb " #regb ", " #regr " \n\t" #define PAVGB_MMX(rega, regb, regr, regfe) \ "movq " #rega ", " #regr " \n\t"\ "por " #regb ", " #regr " \n\t"\ "pxor " #rega ", " #regb " \n\t"\ "pand " #regfe "," #regb " \n\t"\ "psrlq $1, " #regb " \n\t"\ "psubb " #regb ", " #regr " \n\t" // mm6 is supposed to contain 0xfefefefefefefefe #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ "movq " #rega ", " #regr " \n\t"\ "movq " #regc ", " #regp " \n\t"\ "pand " #regb ", " #regr " \n\t"\ "pand " #regd ", " #regp " \n\t"\ "pxor " #rega ", " #regb " \n\t"\ "pxor " #regc ", " #regd " \n\t"\ "pand %%mm6, " #regb " \n\t"\ "pand %%mm6, " #regd " \n\t"\ "psrlq $1, " #regb " \n\t"\ "psrlq $1, " #regd " \n\t"\ "paddb " #regb ", " #regr " \n\t"\ "paddb " #regd ", " #regp " \n\t" #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ "movq " #rega ", " #regr " \n\t"\ "movq " #regc ", " #regp " \n\t"\ "por " #regb ", " #regr " \n\t"\ "por " #regd ", " #regp " \n\t"\ "pxor " #rega ", " #regb " \n\t"\ "pxor " #regc ", " #regd " \n\t"\ "pand %%mm6, " #regb " \n\t"\ "pand %%mm6, " #regd " \n\t"\ "psrlq $1, " #regd " \n\t"\ "psrlq $1, " #regb " \n\t"\ "psubb " #regb ", " #regr " \n\t"\ "psubb " #regd ", " #regp " \n\t" /***********************************/ /* MMX no rounding */ #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx #define SET_RND MOVQ_WONE #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) #include "dsputil_mmx_rnd.h" #undef DEF #undef SET_RND #undef PAVGBP #undef PAVGB /***********************************/ /* MMX rounding */ #define DEF(x, y) x ## _ ## y ##_mmx #define SET_RND MOVQ_WTWO #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) #include "dsputil_mmx_rnd.h" #undef DEF #undef SET_RND #undef PAVGBP #undef PAVGB /***********************************/ /* 3Dnow specific */ #define DEF(x) x ## _3dnow #define PAVGB "pavgusb" #include "dsputil_mmx_avg.h" #undef DEF #undef PAVGB /***********************************/ /* MMX2 specific */ #define DEF(x) x ## _mmx2 /* Introduced only in MMX2 set */ #define PAVGB "pavgb" #include "dsputil_mmx_avg.h" #undef DEF #undef PAVGB #define SBUTTERFLY(a,b,t,n,m)\ "mov" #m " " #a ", " #t " \n\t" /* abcd */\ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ #define TRANSPOSE4(a,b,c,d,t)\ SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ /***********************************/ /* standard MMX */ #ifdef CONFIG_ENCODERS static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size) { asm volatile( "mov $-128, %%"REG_a" \n\t" "pxor %%mm7, %%mm7 \n\t" ASMALIGN(4) "1: \n\t" "movq (%0), %%mm0 \n\t" "movq (%0, %2), %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" "punpckhbw %%mm7, %%mm1 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm3 \n\t" "movq %%mm0, (%1, %%"REG_a") \n\t" "movq %%mm1, 8(%1, %%"REG_a") \n\t" "movq %%mm2, 16(%1, %%"REG_a") \n\t" "movq %%mm3, 24(%1, %%"REG_a") \n\t" "add %3, %0 \n\t" "add $32, %%"REG_a" \n\t" "js 1b \n\t" : "+r" (pixels) : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2) : "%"REG_a ); } static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride) { asm volatile( "pxor %%mm7, %%mm7 \n\t" "mov $-128, %%"REG_a" \n\t" ASMALIGN(4) "1: \n\t" "movq (%0), %%mm0 \n\t" "movq (%1), %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" "movq %%mm2, %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" "punpckhbw %%mm7, %%mm1 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm3 \n\t" "psubw %%mm2, %%mm0 \n\t" "psubw %%mm3, %%mm1 \n\t" "movq %%mm0, (%2, %%"REG_a") \n\t" "movq %%mm1, 8(%2, %%"REG_a") \n\t" "add %3, %0 \n\t" "add %3, %1 \n\t" "add $16, %%"REG_a" \n\t" "jnz 1b \n\t" : "+r" (s1), "+r" (s2) : "r" (block+64), "r" ((long)stride) : "%"REG_a ); } #endif //CONFIG_ENCODERS void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) { const DCTELEM *p; uint8_t *pix; /* read the pixels */ p = block; pix = pixels; /* unrolled loop */ __asm __volatile( "movq %3, %%mm0 \n\t" "movq 8%3, %%mm1 \n\t" "movq 16%3, %%mm2 \n\t" "movq 24%3, %%mm3 \n\t" "movq 32%3, %%mm4 \n\t" "movq 40%3, %%mm5 \n\t" "movq 48%3, %%mm6 \n\t" "movq 56%3, %%mm7 \n\t" "packuswb %%mm1, %%mm0 \n\t" "packuswb %%mm3, %%mm2 \n\t" "packuswb %%mm5, %%mm4 \n\t" "packuswb %%mm7, %%mm6 \n\t" "movq %%mm0, (%0) \n\t" "movq %%mm2, (%0, %1) \n\t" "movq %%mm4, (%0, %1, 2) \n\t" "movq %%mm6, (%0, %2) \n\t" ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p) :"memory"); pix += line_size*4; p += 32; // if here would be an exact copy of the code above // compiler would generate some very strange code // thus using "r" __asm __volatile( "movq (%3), %%mm0 \n\t" "movq 8(%3), %%mm1 \n\t" "movq 16(%3), %%mm2 \n\t" "movq 24(%3), %%mm3 \n\t" "movq 32(%3), %%mm4 \n\t" "movq 40(%3), %%mm5 \n\t" "movq 48(%3), %%mm6 \n\t" "movq 56(%3), %%mm7 \n\t" "packuswb %%mm1, %%mm0 \n\t" "packuswb %%mm3, %%mm2 \n\t" "packuswb %%mm5, %%mm4 \n\t" "packuswb %%mm7, %%mm6 \n\t" "movq %%mm0, (%0) \n\t" "movq %%mm2, (%0, %1) \n\t" "movq %%mm4, (%0, %1, 2) \n\t" "movq %%mm6, (%0, %2) \n\t" ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p) :"memory"); } static DECLARE_ALIGNED_8(const unsigned char, vector128[8]) = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) { int i; movq_m2r(*vector128, mm1); for (i = 0; i < 8; i++) { movq_m2r(*(block), mm0); packsswb_m2r(*(block + 4), mm0); block += 8; paddb_r2r(mm1, mm0); movq_r2m(mm0, *pixels); pixels += line_size; } } void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size) { const DCTELEM *p; uint8_t *pix; int i; /* read the pixels */ p = block; pix = pixels; MOVQ_ZERO(mm7); i = 4; do { __asm __volatile( "movq (%2), %%mm0 \n\t" "movq 8(%2), %%mm1 \n\t" "movq 16(%2), %%mm2 \n\t" "movq 24(%2), %%mm3 \n\t" "movq %0, %%mm4 \n\t" "movq %1, %%mm6 \n\t" "movq %%mm4, %%mm5 \n\t" "punpcklbw %%mm7, %%mm4 \n\t" "punpckhbw %%mm7, %%mm5 \n\t" "paddsw %%mm4, %%mm0 \n\t" "paddsw %%mm5, %%mm1 \n\t" "movq %%mm6, %%mm5 \n\t" "punpcklbw %%mm7, %%mm6 \n\t" "punpckhbw %%mm7, %%mm5 \n\t" "paddsw %%mm6, %%mm2 \n\t" "paddsw %%mm5, %%mm3 \n\t" "packuswb %%mm1, %%mm0 \n\t" "packuswb %%mm3, %%mm2 \n\t" "movq %%mm0, %0 \n\t" "movq %%mm2, %1 \n\t" :"+m"(*pix), "+m"(*(pix+line_size)) :"r"(p) :"memory"); pix += line_size*2; p += 16; } while (--i); } static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" ASMALIGN(3) "1: \n\t" "movd (%1), %%mm0 \n\t" "movd (%1, %3), %%mm1 \n\t" "movd %%mm0, (%2) \n\t" "movd %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "movd (%1), %%mm0 \n\t" "movd (%1, %3), %%mm1 \n\t" "movd %%mm0, (%2) \n\t" "movd %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((long)line_size) : "%"REG_a, "memory" ); } static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((long)line_size) : "%"REG_a, "memory" ); } static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h) { __asm __volatile( "lea (%3, %3), %%"REG_a" \n\t" ASMALIGN(3) "1: \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1, %3), %%mm5 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "movq (%1), %%mm0 \n\t" "movq 8(%1), %%mm4 \n\t" "movq (%1, %3), %%mm1 \n\t" "movq 8(%1, %3), %%mm5 \n\t" "movq %%mm0, (%2) \n\t" "movq %%mm4, 8(%2) \n\t" "movq %%mm1, (%2, %3) \n\t" "movq %%mm5, 8(%2, %3) \n\t" "add %%"REG_a", %1 \n\t" "add %%"REG_a", %2 \n\t" "subl $4, %0 \n\t" "jnz 1b \n\t" : "+g"(h), "+r" (pixels), "+r" (block) : "r"((long)line_size) : "%"REG_a, "memory" ); } static void clear_blocks_mmx(DCTELEM *blocks) { __asm __volatile( "pxor %%mm7, %%mm7 \n\t" "mov $-128*6, %%"REG_a" \n\t" "1: \n\t" "movq %%mm7, (%0, %%"REG_a") \n\t" "movq %%mm7, 8(%0, %%"REG_a") \n\t" "movq %%mm7, 16(%0, %%"REG_a") \n\t" "movq %%mm7, 24(%0, %%"REG_a") \n\t" "add $32, %%"REG_a" \n\t" " js 1b \n\t" : : "r" (((uint8_t *)blocks)+128*6) : "%"REG_a ); } #ifdef CONFIG_ENCODERS static int pix_sum16_mmx(uint8_t * pix, int line_size){ const int h=16; int sum; long index= -line_size*h; __asm __volatile( "pxor %%mm7, %%mm7 \n\t" "pxor %%mm6, %%mm6 \n\t" "1: \n\t" "movq (%2, %1), %%mm0 \n\t" "movq (%2, %1), %%mm1 \n\t" "movq 8(%2, %1), %%mm2 \n\t" "movq 8(%2, %1), %%mm3 \n\t" "punpcklbw %%mm7, %%mm0 \n\t" "punpckhbw %%mm7, %%mm1 \n\t" "punpcklbw %%mm7, %%mm2 \n\t" "punpckhbw %%mm7, %%mm3 \n\t" "paddw %%mm0, %%mm1 \n\t" "paddw %%mm2, %%mm3 \n\t" "paddw %%mm1, %%mm3 \n\t" "paddw %%mm3, %%mm6 \n\t" "add %3, %1 \n\t" " js 1b \n\t" "movq %%mm6, %%mm5 \n\t" "psrlq $32, %%mm6 \n\t" "paddw %%mm5, %%mm6 \n\t" "movq %%mm6, %%mm5 \n\t" "psrlq $16, %%mm6 \n\t" "paddw %%mm5, %%mm6 \n\t" "movd %%mm6, %0 \n\t" "andl $0xFFFF, %0 \n\t" : "=&r" (sum), "+r" (index) : "r" (pix - index), "r" ((long)line_size) ); return sum; } #endif //CONFIG_ENCODERS static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){ long i=0; asm volatile( "1: \n\t" "movq (%1, %0), %%mm0 \n\t" "movq (%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, (%2, %0) \n\t" "movq 8(%1, %0), %%mm0 \n\t" "movq 8(%2, %0), %%mm1 \n\t" "paddb %%mm0, %%mm1 \n\t" "movq %%mm1, 8(%2, %0) \n\t" "add $16, %0 \n\t" "cmp %3, %0 \n\t" " jb 1b \n\t" : "+r" (i) : "r"(src), "r"(dst), "r"((long)w-15) ); for(; idsp.sse[0](c, pix1, pix2, line_size, h); else score1 = sse16_mmx(c, pix1, pix2, line_size, h); score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; else return score1 + FFABS(score2)*8; } static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { MpegEncContext *c = p; int score1= sse8_mmx(c, pix1, pix2, line_size, h); int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; else return score1 + FFABS(score2)*8; } static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { int tmp; assert( (((int)pix) & 7) == 0); assert((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0), %%mm2\n"\ "movq 8(%0), %%mm3\n"\ "add %2,%0\n"\ "movq %%mm2, " #out0 "\n"\ "movq %%mm3, " #out1 "\n"\ "psubusb " #in0 ", %%mm2\n"\ "psubusb " #in1 ", %%mm3\n"\ "psubusb " #out0 ", " #in0 "\n"\ "psubusb " #out1 ", " #in1 "\n"\ "por %%mm2, " #in0 "\n"\ "por %%mm3, " #in1 "\n"\ "movq " #in0 ", %%mm2\n"\ "movq " #in1 ", %%mm3\n"\ "punpcklbw %%mm7, " #in0 "\n"\ "punpcklbw %%mm7, " #in1 "\n"\ "punpckhbw %%mm7, %%mm2\n"\ "punpckhbw %%mm7, %%mm3\n"\ "paddw " #in1 ", " #in0 "\n"\ "paddw %%mm3, %%mm2\n"\ "paddw %%mm2, " #in0 "\n"\ "paddw " #in0 ", %%mm6\n" asm volatile ( "movl %3,%%ecx\n" "pxor %%mm6,%%mm6\n" "pxor %%mm7,%%mm7\n" "movq (%0),%%mm0\n" "movq 8(%0),%%mm1\n" "add %2,%0\n" "subl $2, %%ecx\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" SUM(%%mm4, %%mm5, %%mm0, %%mm1) SUM(%%mm0, %%mm1, %%mm4, %%mm5) "subl $2, %%ecx\n" "jnz 1b\n" "movq %%mm6,%%mm0\n" "psrlq $32, %%mm6\n" "paddw %%mm6,%%mm0\n" "movq %%mm0,%%mm6\n" "psrlq $16, %%mm0\n" "paddw %%mm6,%%mm0\n" "movd %%mm0,%1\n" : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp & 0xFFFF; } #undef SUM static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { int tmp; assert( (((int)pix) & 7) == 0); assert((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0), " #out0 "\n"\ "movq 8(%0), " #out1 "\n"\ "add %2,%0\n"\ "psadbw " #out0 ", " #in0 "\n"\ "psadbw " #out1 ", " #in1 "\n"\ "paddw " #in1 ", " #in0 "\n"\ "paddw " #in0 ", %%mm6\n" asm volatile ( "movl %3,%%ecx\n" "pxor %%mm6,%%mm6\n" "pxor %%mm7,%%mm7\n" "movq (%0),%%mm0\n" "movq 8(%0),%%mm1\n" "add %2,%0\n" "subl $2, %%ecx\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" SUM(%%mm4, %%mm5, %%mm0, %%mm1) SUM(%%mm0, %%mm1, %%mm4, %%mm5) "subl $2, %%ecx\n" "jnz 1b\n" "movd %%mm6,%1\n" : "+r" (pix), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp; } #undef SUM static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; assert( (((int)pix1) & 7) == 0); assert( (((int)pix2) & 7) == 0); assert((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0),%%mm2\n"\ "movq (%1)," #out0 "\n"\ "movq 8(%0),%%mm3\n"\ "movq 8(%1)," #out1 "\n"\ "add %3,%0\n"\ "add %3,%1\n"\ "psubb " #out0 ", %%mm2\n"\ "psubb " #out1 ", %%mm3\n"\ "pxor %%mm7, %%mm2\n"\ "pxor %%mm7, %%mm3\n"\ "movq %%mm2, " #out0 "\n"\ "movq %%mm3, " #out1 "\n"\ "psubusb " #in0 ", %%mm2\n"\ "psubusb " #in1 ", %%mm3\n"\ "psubusb " #out0 ", " #in0 "\n"\ "psubusb " #out1 ", " #in1 "\n"\ "por %%mm2, " #in0 "\n"\ "por %%mm3, " #in1 "\n"\ "movq " #in0 ", %%mm2\n"\ "movq " #in1 ", %%mm3\n"\ "punpcklbw %%mm7, " #in0 "\n"\ "punpcklbw %%mm7, " #in1 "\n"\ "punpckhbw %%mm7, %%mm2\n"\ "punpckhbw %%mm7, %%mm3\n"\ "paddw " #in1 ", " #in0 "\n"\ "paddw %%mm3, %%mm2\n"\ "paddw %%mm2, " #in0 "\n"\ "paddw " #in0 ", %%mm6\n" asm volatile ( "movl %4,%%ecx\n" "pxor %%mm6,%%mm6\n" "pcmpeqw %%mm7,%%mm7\n" "psllw $15, %%mm7\n" "packsswb %%mm7, %%mm7\n" "movq (%0),%%mm0\n" "movq (%1),%%mm2\n" "movq 8(%0),%%mm1\n" "movq 8(%1),%%mm3\n" "add %3,%0\n" "add %3,%1\n" "subl $2, %%ecx\n" "psubb %%mm2, %%mm0\n" "psubb %%mm3, %%mm1\n" "pxor %%mm7, %%mm0\n" "pxor %%mm7, %%mm1\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" SUM(%%mm4, %%mm5, %%mm0, %%mm1) SUM(%%mm0, %%mm1, %%mm4, %%mm5) "subl $2, %%ecx\n" "jnz 1b\n" "movq %%mm6,%%mm0\n" "psrlq $32, %%mm6\n" "paddw %%mm6,%%mm0\n" "movq %%mm0,%%mm6\n" "psrlq $16, %%mm0\n" "paddw %%mm6,%%mm0\n" "movd %%mm0,%2\n" : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp & 0x7FFF; } #undef SUM static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { int tmp; assert( (((int)pix1) & 7) == 0); assert( (((int)pix2) & 7) == 0); assert((line_size &7) ==0); #define SUM(in0, in1, out0, out1) \ "movq (%0)," #out0 "\n"\ "movq (%1),%%mm2\n"\ "movq 8(%0)," #out1 "\n"\ "movq 8(%1),%%mm3\n"\ "add %3,%0\n"\ "add %3,%1\n"\ "psubb %%mm2, " #out0 "\n"\ "psubb %%mm3, " #out1 "\n"\ "pxor %%mm7, " #out0 "\n"\ "pxor %%mm7, " #out1 "\n"\ "psadbw " #out0 ", " #in0 "\n"\ "psadbw " #out1 ", " #in1 "\n"\ "paddw " #in1 ", " #in0 "\n"\ "paddw " #in0 ", %%mm6\n" asm volatile ( "movl %4,%%ecx\n" "pxor %%mm6,%%mm6\n" "pcmpeqw %%mm7,%%mm7\n" "psllw $15, %%mm7\n" "packsswb %%mm7, %%mm7\n" "movq (%0),%%mm0\n" "movq (%1),%%mm2\n" "movq 8(%0),%%mm1\n" "movq 8(%1),%%mm3\n" "add %3,%0\n" "add %3,%1\n" "subl $2, %%ecx\n" "psubb %%mm2, %%mm0\n" "psubb %%mm3, %%mm1\n" "pxor %%mm7, %%mm0\n" "pxor %%mm7, %%mm1\n" SUM(%%mm0, %%mm1, %%mm4, %%mm5) "1:\n" SUM(%%mm4, %%mm5, %%mm0, %%mm1) SUM(%%mm0, %%mm1, %%mm4, %%mm5) "subl $2, %%ecx\n" "jnz 1b\n" "movd %%mm6,%2\n" : "+r" (pix1), "+r" (pix2), "=r"(tmp) : "r" ((long)line_size) , "m" (h) : "%ecx"); return tmp; } #undef SUM static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){ long i=0; asm volatile( "1: \n\t" "movq (%2, %0), %%mm0 \n\t" "movq (%1, %0), %%mm1 \n\t" "psubb %%mm0, %%mm1 \n\t" "movq %%mm1, (%3, %0) \n\t" "movq 8(%2, %0), %%mm0 \n\t" "movq 8(%1, %0), %%mm1 \n\t" "psubb %%mm0, %%mm1 \n\t" "movq %%mm1, 8(%3, %0) \n\t" "add $16, %0 \n\t" "cmp %4, %0 \n\t" " jb 1b \n\t" : "+r" (i) : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15) ); for(; i 05736421 #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ SBUTTERFLY(a,b,%%xmm8,wd,dqa)\ SBUTTERFLY(c,d,b,wd,dqa)\ SBUTTERFLY(e,f,d,wd,dqa)\ SBUTTERFLY(g,h,f,wd,dqa)\ SBUTTERFLY(a,c,h,dq,dqa)\ SBUTTERFLY(%%xmm8,b,c,dq,dqa)\ SBUTTERFLY(e,g,b,dq,dqa)\ SBUTTERFLY(d,f,g,dq,dqa)\ SBUTTERFLY(a,e,f,qdq,dqa)\ SBUTTERFLY(%%xmm8,d,e,qdq,dqa)\ SBUTTERFLY(h,b,d,qdq,dqa)\ SBUTTERFLY(c,g,b,qdq,dqa)\ "movdqa %%xmm8, "#g" \n\t" #else #define TRANSPOSE8(a,b,c,d,e,f,g,h,t)\ "movdqa "#h", "#t" \n\t"\ SBUTTERFLY(a,b,h,wd,dqa)\ "movdqa "#h", 16"#t" \n\t"\ "movdqa "#t", "#h" \n\t"\ SBUTTERFLY(c,d,b,wd,dqa)\ SBUTTERFLY(e,f,d,wd,dqa)\ SBUTTERFLY(g,h,f,wd,dqa)\ SBUTTERFLY(a,c,h,dq,dqa)\ "movdqa "#h", "#t" \n\t"\ "movdqa 16"#t", "#h" \n\t"\ SBUTTERFLY(h,b,c,dq,dqa)\ SBUTTERFLY(e,g,b,dq,dqa)\ SBUTTERFLY(d,f,g,dq,dqa)\ SBUTTERFLY(a,e,f,qdq,dqa)\ SBUTTERFLY(h,d,e,qdq,dqa)\ "movdqa "#h", 16"#t" \n\t"\ "movdqa "#t", "#h" \n\t"\ SBUTTERFLY(h,b,d,qdq,dqa)\ SBUTTERFLY(c,g,b,qdq,dqa)\ "movdqa 16"#t", "#g" \n\t" #endif #define LBUTTERFLY2(a1,b1,a2,b2)\ "paddw " #b1 ", " #a1 " \n\t"\ "paddw " #b2 ", " #a2 " \n\t"\ "paddw " #b1 ", " #b1 " \n\t"\ "paddw " #b2 ", " #b2 " \n\t"\ "psubw " #a1 ", " #b1 " \n\t"\ "psubw " #a2 ", " #b2 " \n\t" #define HADAMARD8(m0, m1, m2, m3, m4, m5, m6, m7)\ LBUTTERFLY2(m0, m1, m2, m3)\ LBUTTERFLY2(m4, m5, m6, m7)\ LBUTTERFLY2(m0, m2, m1, m3)\ LBUTTERFLY2(m4, m6, m5, m7)\ LBUTTERFLY2(m0, m4, m1, m5)\ LBUTTERFLY2(m2, m6, m3, m7)\ #define HADAMARD48 HADAMARD8(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7) #define MMABS_MMX(a,z)\ "pxor " #z ", " #z " \n\t"\ "pcmpgtw " #a ", " #z " \n\t"\ "pxor " #z ", " #a " \n\t"\ "psubw " #z ", " #a " \n\t" #define MMABS_MMX2(a,z)\ "pxor " #z ", " #z " \n\t"\ "psubw " #a ", " #z " \n\t"\ "pmaxsw " #z ", " #a " \n\t" #define MMABS_SSSE3(a,z)\ "pabsw " #a ", " #a " \n\t" #define MMABS_SUM(a,z, sum)\ MMABS(a,z)\ "paddusw " #a ", " #sum " \n\t" #define MMABS_SUM_8x8_NOSPILL\ MMABS(%%xmm0, %%xmm8)\ MMABS(%%xmm1, %%xmm9)\ MMABS_SUM(%%xmm2, %%xmm8, %%xmm0)\ MMABS_SUM(%%xmm3, %%xmm9, %%xmm1)\ MMABS_SUM(%%xmm4, %%xmm8, %%xmm0)\ MMABS_SUM(%%xmm5, %%xmm9, %%xmm1)\ MMABS_SUM(%%xmm6, %%xmm8, %%xmm0)\ MMABS_SUM(%%xmm7, %%xmm9, %%xmm1)\ "paddusw %%xmm1, %%xmm0 \n\t" #ifdef ARCH_X86_64 #define MMABS_SUM_8x8_SSE2 MMABS_SUM_8x8_NOSPILL #else #define MMABS_SUM_8x8_SSE2\ "movdqa %%xmm7, (%1) \n\t"\ MMABS(%%xmm0, %%xmm7)\ MMABS(%%xmm1, %%xmm7)\ MMABS_SUM(%%xmm2, %%xmm7, %%xmm0)\ MMABS_SUM(%%xmm3, %%xmm7, %%xmm1)\ MMABS_SUM(%%xmm4, %%xmm7, %%xmm0)\ MMABS_SUM(%%xmm5, %%xmm7, %%xmm1)\ MMABS_SUM(%%xmm6, %%xmm7, %%xmm0)\ "movdqa (%1), %%xmm2 \n\t"\ MMABS_SUM(%%xmm2, %%xmm7, %%xmm1)\ "paddusw %%xmm1, %%xmm0 \n\t" #endif #define LOAD4(o, a, b, c, d)\ "movq "#o"(%1), "#a" \n\t"\ "movq "#o"+8(%1), "#b" \n\t"\ "movq "#o"+16(%1), "#c" \n\t"\ "movq "#o"+24(%1), "#d" \n\t"\ #define STORE4(o, a, b, c, d)\ "movq "#a", "#o"(%1) \n\t"\ "movq "#b", "#o"+8(%1) \n\t"\ "movq "#c", "#o"+16(%1) \n\t"\ "movq "#d", "#o"+24(%1) \n\t"\ /* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to * about 100k on extreme inputs. But that's very unlikely to occur in natural video, * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */ #define HSUM_MMX(a, t, dst)\ "movq "#a", "#t" \n\t"\ "psrlq $32, "#a" \n\t"\ "paddusw "#t", "#a" \n\t"\ "movq "#a", "#t" \n\t"\ "psrlq $16, "#a" \n\t"\ "paddusw "#t", "#a" \n\t"\ "movd "#a", "#dst" \n\t"\ #define HSUM_MMX2(a, t, dst)\ "pshufw $0x0E, "#a", "#t" \n\t"\ "paddusw "#t", "#a" \n\t"\ "pshufw $0x01, "#a", "#t" \n\t"\ "paddusw "#t", "#a" \n\t"\ "movd "#a", "#dst" \n\t"\ #define HSUM_SSE2(a, t, dst)\ "movhlps "#a", "#t" \n\t"\ "paddusw "#t", "#a" \n\t"\ "pshuflw $0x0E, "#a", "#t" \n\t"\ "paddusw "#t", "#a" \n\t"\ "pshuflw $0x01, "#a", "#t" \n\t"\ "paddusw "#t", "#a" \n\t"\ "movd "#a", "#dst" \n\t"\ #define HADAMARD8_DIFF_MMX(cpu) \ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ DECLARE_ALIGNED_8(uint64_t, temp[13]);\ int sum;\ \ assert(h==8);\ \ DIFF_PIXELS_4x8(src1, src2, stride, temp[0]);\ \ asm volatile(\ HADAMARD48\ \ "movq %%mm7, 96(%1) \n\t"\ \ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\ \ "movq 96(%1), %%mm7 \n\t"\ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\ \ : "=r" (sum)\ : "r"(temp)\ );\ \ DIFF_PIXELS_4x8(src1+4, src2+4, stride, temp[4]);\ \ asm volatile(\ HADAMARD48\ \ "movq %%mm7, 96(%1) \n\t"\ \ TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\ STORE4(32, %%mm0, %%mm3, %%mm7, %%mm2)\ \ "movq 96(%1), %%mm7 \n\t"\ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\ "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\ "movq %%mm6, %%mm7 \n\t"\ "movq %%mm0, %%mm6 \n\t"\ \ LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\ \ HADAMARD48\ "movq %%mm7, 64(%1) \n\t"\ MMABS(%%mm0, %%mm7)\ MMABS(%%mm1, %%mm7)\ MMABS_SUM(%%mm2, %%mm7, %%mm0)\ MMABS_SUM(%%mm3, %%mm7, %%mm1)\ MMABS_SUM(%%mm4, %%mm7, %%mm0)\ MMABS_SUM(%%mm5, %%mm7, %%mm1)\ MMABS_SUM(%%mm6, %%mm7, %%mm0)\ "movq 64(%1), %%mm2 \n\t"\ MMABS_SUM(%%mm2, %%mm7, %%mm1)\ "paddusw %%mm1, %%mm0 \n\t"\ "movq %%mm0, 64(%1) \n\t"\ \ LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\ LOAD4(32, %%mm4, %%mm5, %%mm6, %%mm7)\ \ HADAMARD48\ "movq %%mm7, (%1) \n\t"\ MMABS(%%mm0, %%mm7)\ MMABS(%%mm1, %%mm7)\ MMABS_SUM(%%mm2, %%mm7, %%mm0)\ MMABS_SUM(%%mm3, %%mm7, %%mm1)\ MMABS_SUM(%%mm4, %%mm7, %%mm0)\ MMABS_SUM(%%mm5, %%mm7, %%mm1)\ MMABS_SUM(%%mm6, %%mm7, %%mm0)\ "movq (%1), %%mm2 \n\t"\ MMABS_SUM(%%mm2, %%mm7, %%mm1)\ "paddusw 64(%1), %%mm0 \n\t"\ "paddusw %%mm1, %%mm0 \n\t"\ \ HSUM(%%mm0, %%mm1, %0)\ \ : "=r" (sum)\ : "r"(temp)\ );\ return sum&0xFFFF;\ }\ WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) #define HADAMARD8_DIFF_SSE2(cpu) \ static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\ DECLARE_ALIGNED_16(uint64_t, temp[4]);\ int sum;\ \ assert(h==8);\ \ DIFF_PIXELS_8x8(src1, src2, stride, temp[0]);\ \ asm volatile(\ HADAMARD8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)\ TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%1))\ HADAMARD8(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)\ MMABS_SUM_8x8\ HSUM_SSE2(%%xmm0, %%xmm1, %0)\ : "=r" (sum)\ : "r"(temp)\ );\ return sum&0xFFFF;\ }\ WARPER8_16_SQ(hadamard8_diff_##cpu, hadamard8_diff16_##cpu) #define MMABS(a,z) MMABS_MMX(a,z) #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) HADAMARD8_DIFF_MMX(mmx) #undef MMABS #undef HSUM #define MMABS(a,z) MMABS_MMX2(a,z) #define MMABS_SUM_8x8 MMABS_SUM_8x8_SSE2 #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) HADAMARD8_DIFF_MMX(mmx2) HADAMARD8_DIFF_SSE2(sse2) #undef MMABS #undef MMABS_SUM_8x8 #undef HSUM #ifdef HAVE_SSSE3 #define MMABS(a,z) MMABS_SSSE3(a,z) #define MMABS_SUM_8x8 MMABS_SUM_8x8_NOSPILL HADAMARD8_DIFF_SSE2(ssse3) #undef MMABS #undef MMABS_SUM_8x8 #endif #define DCT_SAD4(m,mm,o)\ "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ MMABS_SUM(mm##2, mm##6, mm##0)\ MMABS_SUM(mm##3, mm##7, mm##1)\ MMABS_SUM(mm##4, mm##6, mm##0)\ MMABS_SUM(mm##5, mm##7, mm##1)\ #define DCT_SAD_MMX\ "pxor %%mm0, %%mm0 \n\t"\ "pxor %%mm1, %%mm1 \n\t"\ DCT_SAD4(q, %%mm, 0)\ DCT_SAD4(q, %%mm, 8)\ DCT_SAD4(q, %%mm, 64)\ DCT_SAD4(q, %%mm, 72)\ "paddusw %%mm1, %%mm0 \n\t"\ HSUM(%%mm0, %%mm1, %0) #define DCT_SAD_SSE2\ "pxor %%xmm0, %%xmm0 \n\t"\ "pxor %%xmm1, %%xmm1 \n\t"\ DCT_SAD4(dqa, %%xmm, 0)\ DCT_SAD4(dqa, %%xmm, 64)\ "paddusw %%xmm1, %%xmm0 \n\t"\ HSUM(%%xmm0, %%xmm1, %0) #define DCT_SAD_FUNC(cpu) \ static int sum_abs_dctelem_##cpu(DCTELEM *block){\ int sum;\ asm volatile(\ DCT_SAD\ :"=r"(sum)\ :"r"(block)\ );\ return sum&0xFFFF;\ } #define DCT_SAD DCT_SAD_MMX #define HSUM(a,t,dst) HSUM_MMX(a,t,dst) #define MMABS(a,z) MMABS_MMX(a,z) DCT_SAD_FUNC(mmx) #undef MMABS #undef HSUM #define HSUM(a,t,dst) HSUM_MMX2(a,t,dst) #define MMABS(a,z) MMABS_MMX2(a,z) DCT_SAD_FUNC(mmx2) #undef HSUM #undef DCT_SAD #define DCT_SAD DCT_SAD_SSE2 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) DCT_SAD_FUNC(sse2) #undef MMABS #ifdef HAVE_SSSE3 #define MMABS(a,z) MMABS_SSSE3(a,z) DCT_SAD_FUNC(ssse3) #undef MMABS #endif #undef HSUM #undef DCT_SAD static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ int sum; long i=size; asm volatile( "pxor %%mm4, %%mm4 \n" "1: \n" "sub $8, %0 \n" "movq (%2,%0), %%mm2 \n" "movq (%3,%0,2), %%mm0 \n" "movq 8(%3,%0,2), %%mm1 \n" "punpckhbw %%mm2, %%mm3 \n" "punpcklbw %%mm2, %%mm2 \n" "psraw $8, %%mm3 \n" "psraw $8, %%mm2 \n" "psubw %%mm3, %%mm1 \n" "psubw %%mm2, %%mm0 \n" "pmaddwd %%mm1, %%mm1 \n" "pmaddwd %%mm0, %%mm0 \n" "paddd %%mm1, %%mm4 \n" "paddd %%mm0, %%mm4 \n" "jg 1b \n" "movq %%mm4, %%mm3 \n" "psrlq $32, %%mm3 \n" "paddd %%mm3, %%mm4 \n" "movd %%mm4, %1 \n" :"+r"(i), "=r"(sum) :"r"(pix1), "r"(pix2) ); return sum; } #endif //CONFIG_ENCODERS #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d) #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d) #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ "movq "#in7", " #m3 " \n\t" /* d */\ "movq "#in0", %%mm5 \n\t" /* D */\ "paddw " #m3 ", %%mm5 \n\t" /* x4 */\ "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\ "movq "#in1", %%mm5 \n\t" /* C */\ "movq "#in2", %%mm6 \n\t" /* B */\ "paddw " #m6 ", %%mm5 \n\t" /* x3 */\ "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ "psraw $5, %%mm5 \n\t"\ "packuswb %%mm5, %%mm5 \n\t"\ OP(%%mm5, out, %%mm7, d) #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\ static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ uint64_t temp;\ \ asm volatile(\ "pxor %%mm7, %%mm7 \n\t"\ "1: \n\t"\ "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\ "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\ "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\ "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\ "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\ "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\ "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\ "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\ "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\ "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\ "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\ "psllq $24, %%mm4 \n\t" /* 000ABCDE */\ "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\ "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\ "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\ "paddw %%mm3, %%mm5 \n\t" /* b */\ "paddw %%mm2, %%mm6 \n\t" /* c */\ "paddw %%mm5, %%mm5 \n\t" /* 2b */\ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ "paddw %%mm4, %%mm0 \n\t" /* a */\ "paddw %%mm1, %%mm5 \n\t" /* d */\ "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ "paddw %6, %%mm6 \n\t"\ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ "psraw $5, %%mm0 \n\t"\ "movq %%mm0, %5 \n\t"\ /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ \ "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\ "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\ "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\ "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\ "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\ "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\ "paddw %%mm0, %%mm2 \n\t" /* b */\ "paddw %%mm5, %%mm3 \n\t" /* c */\ "paddw %%mm2, %%mm2 \n\t" /* 2b */\ "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\ "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ "paddw %%mm2, %%mm1 \n\t" /* a */\ "paddw %%mm6, %%mm4 \n\t" /* d */\ "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ "paddw %6, %%mm1 \n\t"\ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ "psraw $5, %%mm3 \n\t"\ "movq %5, %%mm1 \n\t"\ "packuswb %%mm3, %%mm1 \n\t"\ OP_MMX2(%%mm1, (%1),%%mm4, q)\ /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ \ "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\ "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\ "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\ "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\ "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\ "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\ "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\ "paddw %%mm1, %%mm5 \n\t" /* b */\ "paddw %%mm4, %%mm0 \n\t" /* c */\ "paddw %%mm5, %%mm5 \n\t" /* 2b */\ "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ "paddw %%mm3, %%mm2 \n\t" /* d */\ "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\ "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ "paddw %%mm2, %%mm6 \n\t" /* a */\ "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ "paddw %6, %%mm0 \n\t"\ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ "psraw $5, %%mm0 \n\t"\ /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ \ "paddw %%mm5, %%mm3 \n\t" /* a */\ "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\ "paddw %%mm4, %%mm6 \n\t" /* b */\ "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\ "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\ "paddw %%mm1, %%mm4 \n\t" /* c */\ "paddw %%mm2, %%mm5 \n\t" /* d */\ "paddw %%mm6, %%mm6 \n\t" /* 2b */\ "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ "paddw %6, %%mm4 \n\t"\ "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ "psraw $5, %%mm4 \n\t"\ "packuswb %%mm4, %%mm0 \n\t"\ OP_MMX2(%%mm0, 8(%1), %%mm4, q)\ \ "add %3, %0 \n\t"\ "add %4, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ : "+a"(src), "+c"(dst), "+m"(h)\ : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ : "memory"\ );\ }\ \ static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ int i;\ int16_t temp[16];\ /* quick HACK, XXX FIXME MUST be optimized */\ for(i=0; iput_ ## postfix1 = put_ ## postfix2;\ c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\ c->avg_ ## postfix1 = avg_ ## postfix2; static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height){ const int w = 8; const int ix = ox>>(16+shift); const int iy = oy>>(16+shift); const int oxs = ox>>4; const int oys = oy>>4; const int dxxs = dxx>>4; const int dxys = dxy>>4; const int dyxs = dyx>>4; const int dyys = dyy>>4; const uint16_t r4[4] = {r,r,r,r}; const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys}; const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys}; const uint64_t shift2 = 2*shift; uint8_t edge_buf[(h+1)*stride]; int x, y; const int dxw = (dxx-(1<<(16+shift)))*(w-1); const int dyh = (dyy-(1<<(16+shift)))*(h-1); const int dxh = dxy*(h-1); const int dyw = dyx*(w-1); if( // non-constant fullpel offset (3% of blocks) (ox^(ox+dxw) | ox^(ox+dxh) | ox^(ox+dxw+dxh) | oy^(oy+dyw) | oy^(oy+dyh) | oy^(oy+dyw+dyh)) >> (16+shift) // uses more than 16 bits of subpel mv (only at huge resolution) || (dxx|dxy|dyx|dyy)&15 ) { //FIXME could still use mmx for some of the rows ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height); return; } src += ix + iy*stride; if( (unsigned)ix >= width-w || (unsigned)iy >= height-h ) { ff_emulated_edge_mc(edge_buf, src, stride, w+1, h+1, ix, iy, width, height); src = edge_buf; } asm volatile( "movd %0, %%mm6 \n\t" "pxor %%mm7, %%mm7 \n\t" "punpcklwd %%mm6, %%mm6 \n\t" "punpcklwd %%mm6, %%mm6 \n\t" :: "r"(1<0) & (a ^ sign(m))) "movq %%mm3, %1 \n\t" "movq %%mm0, %0 \n\t" :"+m"(mag[i]), "+m"(ang[i]) ::"memory" ); } asm volatile("femms"); } static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize) { int i; asm volatile( "movaps %0, %%xmm5 \n\t" ::"m"(ff_pdw_80000000[0]) ); for(i=0; i0) & (a ^ sign(m))) "movaps %%xmm3, %1 \n\t" "movaps %%xmm0, %0 \n\t" :"+m"(mag[i]), "+m"(ang[i]) ::"memory" ); } } #ifdef CONFIG_ENCODERS static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data) { double c = 2.0 / (len-1.0); int n2 = len>>1; long i = -n2*sizeof(int32_t); long j = n2*sizeof(int32_t); asm volatile( "movsd %0, %%xmm7 \n\t" "movapd %1, %%xmm6 \n\t" "movapd %2, %%xmm5 \n\t" "movlhps %%xmm7, %%xmm7 \n\t" "subpd %%xmm5, %%xmm7 \n\t" "addsd %%xmm6, %%xmm7 \n\t" ::"m"(c), "m"(*ff_pd_1), "m"(*ff_pd_2) ); #define WELCH(MOVPD)\ asm volatile(\ "1: \n\t"\ "movapd %%xmm7, %%xmm1 \n\t"\ "mulpd %%xmm1, %%xmm1 \n\t"\ "movapd %%xmm6, %%xmm0 \n\t"\ "subpd %%xmm1, %%xmm0 \n\t"\ "pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ "cvtpi2pd (%4,%0), %%xmm2 \n\t"\ "cvtpi2pd (%5,%1), %%xmm3 \n\t"\ "mulpd %%xmm0, %%xmm2 \n\t"\ "mulpd %%xmm1, %%xmm3 \n\t"\ "movapd %%xmm2, (%2,%0,2) \n\t"\ MOVPD" %%xmm3, (%3,%1,2) \n\t"\ "subpd %%xmm5, %%xmm7 \n\t"\ "sub $8, %1 \n\t"\ "add $8, %0 \n\t"\ "jl 1b \n\t"\ :"+&r"(i), "+&r"(j)\ :"r"(w_data+n2), "r"(w_data+len-2-n2),\ "r"(data+n2), "r"(data+len-2-n2)\ ); if(len&1) WELCH("movupd") else WELCH("movapd") #undef WELCH } static void flac_compute_autocorr_sse2(const int32_t *data, int len, int lag, double *autoc) { double tmp[len + lag + 2]; double *data1 = tmp + lag; int j; if((long)data1 & 15) data1++; apply_welch_window_sse2(data, len, data1); for(j=0; jdsp_mask) { if (avctx->dsp_mask & FF_MM_FORCE) mm_flags |= (avctx->dsp_mask & 0xffff); else mm_flags &= ~(avctx->dsp_mask & 0xffff); } #if 0 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:"); if (mm_flags & MM_MMX) av_log(avctx, AV_LOG_INFO, " mmx"); if (mm_flags & MM_MMXEXT) av_log(avctx, AV_LOG_INFO, " mmxext"); if (mm_flags & MM_3DNOW) av_log(avctx, AV_LOG_INFO, " 3dnow"); if (mm_flags & MM_SSE) av_log(avctx, AV_LOG_INFO, " sse"); if (mm_flags & MM_SSE2) av_log(avctx, AV_LOG_INFO, " sse2"); av_log(avctx, AV_LOG_INFO, "\n"); #endif if (mm_flags & MM_MMX) { const int idct_algo= avctx->idct_algo; #ifdef CONFIG_ENCODERS const int dct_algo = avctx->dct_algo; if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ if(mm_flags & MM_SSE2){ c->fdct = ff_fdct_sse2; }else if(mm_flags & MM_MMXEXT){ c->fdct = ff_fdct_mmx2; }else{ c->fdct = ff_fdct_mmx; } } #endif //CONFIG_ENCODERS if(avctx->lowres==0){ if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){ c->idct_put= ff_simple_idct_put_mmx; c->idct_add= ff_simple_idct_add_mmx; c->idct = ff_simple_idct_mmx; c->idct_permutation_type= FF_SIMPLE_IDCT_PERM; #ifdef CONFIG_GPL }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){ if(mm_flags & MM_MMXEXT){ c->idct_put= ff_libmpeg2mmx2_idct_put; c->idct_add= ff_libmpeg2mmx2_idct_add; c->idct = ff_mmxext_idct; }else{ c->idct_put= ff_libmpeg2mmx_idct_put; c->idct_add= ff_libmpeg2mmx_idct_add; c->idct = ff_mmx_idct; } c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM; #endif }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER) && idct_algo==FF_IDCT_VP3 && avctx->codec->id!=CODEC_ID_THEORA && !(avctx->flags & CODEC_FLAG_BITEXACT)){ if(mm_flags & MM_SSE2){ c->idct_put= ff_vp3_idct_put_sse2; c->idct_add= ff_vp3_idct_add_sse2; c->idct = ff_vp3_idct_sse2; c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; }else{ ff_vp3_dsp_init_mmx(); c->idct_put= ff_vp3_idct_put_mmx; c->idct_add= ff_vp3_idct_add_mmx; c->idct = ff_vp3_idct_mmx; c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM; } }else if(idct_algo==FF_IDCT_CAVS){ c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM; }else if(idct_algo==FF_IDCT_XVIDMMX){ if(mm_flags & MM_MMXEXT){ c->idct_put= ff_idct_xvid_mmx2_put; c->idct_add= ff_idct_xvid_mmx2_add; c->idct = ff_idct_xvid_mmx2; }else{ c->idct_put= ff_idct_xvid_mmx_put; c->idct_add= ff_idct_xvid_mmx_add; c->idct = ff_idct_xvid_mmx; } } } #ifdef CONFIG_ENCODERS c->get_pixels = get_pixels_mmx; c->diff_pixels = diff_pixels_mmx; #endif //CONFIG_ENCODERS c->put_pixels_clamped = put_pixels_clamped_mmx; c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx; c->add_pixels_clamped = add_pixels_clamped_mmx; c->clear_blocks = clear_blocks_mmx; #ifdef CONFIG_ENCODERS c->pix_sum = pix_sum16_mmx; #endif //CONFIG_ENCODERS c->put_pixels_tab[0][0] = put_pixels16_mmx; c->put_pixels_tab[0][1] = put_pixels16_x2_mmx; c->put_pixels_tab[0][2] = put_pixels16_y2_mmx; c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx; c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx; c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx; c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx; c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx; c->avg_pixels_tab[0][0] = avg_pixels16_mmx; c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx; c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx; c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx; c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx; c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx; c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx; c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx; c->put_pixels_tab[1][0] = put_pixels8_mmx; c->put_pixels_tab[1][1] = put_pixels8_x2_mmx; c->put_pixels_tab[1][2] = put_pixels8_y2_mmx; c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx; c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx; c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx; c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx; c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx; c->avg_pixels_tab[1][0] = avg_pixels8_mmx; c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx; c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx; c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx; c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx; c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx; c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx; c->gmc= gmc_mmx; c->add_bytes= add_bytes_mmx; #ifdef CONFIG_ENCODERS c->diff_bytes= diff_bytes_mmx; c->sum_abs_dctelem= sum_abs_dctelem_mmx; c->hadamard8_diff[0]= hadamard8_diff16_mmx; c->hadamard8_diff[1]= hadamard8_diff_mmx; c->pix_norm1 = pix_norm1_mmx; c->sse[0] = (mm_flags & MM_SSE2) ? sse16_sse2 : sse16_mmx; c->sse[1] = sse8_mmx; c->vsad[4]= vsad_intra16_mmx; c->nsse[0] = nsse16_mmx; c->nsse[1] = nsse8_mmx; if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->vsad[0] = vsad16_mmx; } if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->try_8x8basis= try_8x8basis_mmx; } c->add_8x8basis= add_8x8basis_mmx; c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; #endif //CONFIG_ENCODERS if (ENABLE_ANY_H263) { c->h263_v_loop_filter= h263_v_loop_filter_mmx; c->h263_h_loop_filter= h263_h_loop_filter_mmx; } c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx; c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx; c->h264_idct_dc_add= c->h264_idct_add= ff_h264_idct_add_mmx; c->h264_idct8_dc_add= c->h264_idct8_add= ff_h264_idct8_add_mmx; if (mm_flags & MM_MMXEXT) { c->prefetch = prefetch_mmx2; c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2; c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2; c->avg_pixels_tab[0][0] = avg_pixels16_mmx2; c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2; c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2; c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2; c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2; c->avg_pixels_tab[1][0] = avg_pixels8_mmx2; c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2; c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2; #ifdef CONFIG_ENCODERS c->sum_abs_dctelem= sum_abs_dctelem_mmx2; c->hadamard8_diff[0]= hadamard8_diff16_mmx2; c->hadamard8_diff[1]= hadamard8_diff_mmx2; c->vsad[4]= vsad_intra16_mmx2; #endif //CONFIG_ENCODERS c->h264_idct_dc_add= ff_h264_idct_dc_add_mmx2; c->h264_idct8_dc_add= ff_h264_idct8_dc_add_mmx2; if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2; c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2; c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2; c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2; c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2; #ifdef CONFIG_ENCODERS c->vsad[0] = vsad16_mmx2; #endif //CONFIG_ENCODERS } #if 1 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2) SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2) #endif //FIXME 3dnow too #define dspfunc(PFX, IDX, NUM) \ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \ c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \ c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \ c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \ c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \ c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \ c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \ c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \ c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \ c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \ c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \ c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \ c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \ c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \ c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \ c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2 dspfunc(put_h264_qpel, 0, 16); dspfunc(put_h264_qpel, 1, 8); dspfunc(put_h264_qpel, 2, 4); dspfunc(avg_h264_qpel, 0, 16); dspfunc(avg_h264_qpel, 1, 8); dspfunc(avg_h264_qpel, 2, 4); dspfunc(put_2tap_qpel, 0, 16); dspfunc(put_2tap_qpel, 1, 8); dspfunc(avg_2tap_qpel, 0, 16); dspfunc(avg_2tap_qpel, 1, 8); #undef dspfunc c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2; c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2; c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2; c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_mmx2; c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2; c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2; c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2; c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2; c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2; c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2; c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2; c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2; c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2; c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2; c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2; c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2; c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2; c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2; c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2; c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2; c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2; c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2; c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2; c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2; c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2; c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2; c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2; #ifdef CONFIG_CAVS_DECODER ff_cavsdsp_init_mmx2(c, avctx); #endif #ifdef CONFIG_ENCODERS c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2; #endif //CONFIG_ENCODERS } else if (mm_flags & MM_3DNOW) { c->prefetch = prefetch_3dnow; c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow; c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow; c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow; c->avg_pixels_tab[1][0] = avg_pixels8_3dnow; c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow; c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow; if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow; c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow; c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow; } SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow) SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow) #define dspfunc(PFX, IDX, NUM) \ c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \ c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \ c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \ c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \ c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \ c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \ c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \ c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \ c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \ c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \ c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \ c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \ c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \ c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \ c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \ c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow dspfunc(put_h264_qpel, 0, 16); dspfunc(put_h264_qpel, 1, 8); dspfunc(put_h264_qpel, 2, 4); dspfunc(avg_h264_qpel, 0, 16); dspfunc(avg_h264_qpel, 1, 8); dspfunc(avg_h264_qpel, 2, 4); dspfunc(put_2tap_qpel, 0, 16); dspfunc(put_2tap_qpel, 1, 8); dspfunc(avg_2tap_qpel, 0, 16); dspfunc(avg_2tap_qpel, 1, 8); c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow; c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow; } #ifdef CONFIG_ENCODERS if(mm_flags & MM_SSE2){ c->sum_abs_dctelem= sum_abs_dctelem_sse2; c->hadamard8_diff[0]= hadamard8_diff16_sse2; c->hadamard8_diff[1]= hadamard8_diff_sse2; c->flac_compute_autocorr = flac_compute_autocorr_sse2; } #ifdef HAVE_SSSE3 if(mm_flags & MM_SSSE3){ if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->try_8x8basis= try_8x8basis_ssse3; } c->add_8x8basis= add_8x8basis_ssse3; c->sum_abs_dctelem= sum_abs_dctelem_ssse3; c->hadamard8_diff[0]= hadamard8_diff16_ssse3; c->hadamard8_diff[1]= hadamard8_diff_ssse3; } #endif #endif #ifdef CONFIG_SNOW_DECODER if(mm_flags & MM_SSE2 & 0){ c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; #ifdef HAVE_7REGS c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; #endif c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; } else{ if(mm_flags & MM_MMXEXT){ c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; #ifdef HAVE_7REGS c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; #endif } c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; } #endif if(mm_flags & MM_3DNOW){ #ifdef CONFIG_ENCODERS if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ c->try_8x8basis= try_8x8basis_3dnow; } c->add_8x8basis= add_8x8basis_3dnow; #endif //CONFIG_ENCODERS c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow; c->vector_fmul = vector_fmul_3dnow; if(!(avctx->flags & CODEC_FLAG_BITEXACT)) c->float_to_int16 = float_to_int16_3dnow; } if(mm_flags & MM_3DNOWEXT) c->vector_fmul_reverse = vector_fmul_reverse_3dnow2; if(mm_flags & MM_SSE){ c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse; c->vector_fmul = vector_fmul_sse; c->float_to_int16 = float_to_int16_sse; c->vector_fmul_reverse = vector_fmul_reverse_sse; c->vector_fmul_add_add = vector_fmul_add_add_sse; } if(mm_flags & MM_3DNOW) c->vector_fmul_add_add = vector_fmul_add_add_3dnow; // faster than sse } #ifdef CONFIG_ENCODERS dsputil_init_pix_mmx(c, avctx); #endif //CONFIG_ENCODERS #if 0 // for speed testing get_pixels = just_return; put_pixels_clamped = just_return; add_pixels_clamped = just_return; pix_abs16x16 = just_return; pix_abs16x16_x2 = just_return; pix_abs16x16_y2 = just_return; pix_abs16x16_xy2 = just_return; put_pixels_tab[0] = just_return; put_pixels_tab[1] = just_return; put_pixels_tab[2] = just_return; put_pixels_tab[3] = just_return; put_no_rnd_pixels_tab[0] = just_return; put_no_rnd_pixels_tab[1] = just_return; put_no_rnd_pixels_tab[2] = just_return; put_no_rnd_pixels_tab[3] = just_return; avg_pixels_tab[0] = just_return; avg_pixels_tab[1] = just_return; avg_pixels_tab[2] = just_return; avg_pixels_tab[3] = just_return; avg_no_rnd_pixels_tab[0] = just_return; avg_no_rnd_pixels_tab[1] = just_return; avg_no_rnd_pixels_tab[2] = just_return; avg_no_rnd_pixels_tab[3] = just_return; //av_fdct = just_return; //ff_idct = just_return; #endif }