3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw
Originally committed as revision 9053 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
2c32b173d9
commit
038bfcf9d6
@ -2752,91 +2752,69 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ENCODERS
|
||||
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
|
||||
long i=0;
|
||||
|
||||
assert(FFABS(scale) < 256);
|
||||
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
|
||||
#define PHADDD(a, t)\
|
||||
"movq "#a", "#t" \n\t"\
|
||||
"psrlq $32, "#a" \n\t"\
|
||||
"paddd "#t", "#a" \n\t"
|
||||
/*
|
||||
pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
|
||||
pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
|
||||
pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
|
||||
*/
|
||||
#define PMULHRW(x, y, s, o)\
|
||||
"pmulhw " #s ", "#x " \n\t"\
|
||||
"pmulhw " #s ", "#y " \n\t"\
|
||||
"paddw " #o ", "#x " \n\t"\
|
||||
"paddw " #o ", "#y " \n\t"\
|
||||
"psraw $1, "#x " \n\t"\
|
||||
"psraw $1, "#y " \n\t"
|
||||
#define DEF(x) x ## _mmx
|
||||
#define SET_RND MOVQ_WONE
|
||||
#define SCALE_OFFSET 1
|
||||
|
||||
asm volatile(
|
||||
"pcmpeqw %%mm6, %%mm6 \n\t" // -1w
|
||||
"psrlw $15, %%mm6 \n\t" // 1w
|
||||
"pxor %%mm7, %%mm7 \n\t"
|
||||
"movd %4, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
"movq 8(%1, %0), %%mm1 \n\t"
|
||||
"pmulhw %%mm5, %%mm0 \n\t"
|
||||
"pmulhw %%mm5, %%mm1 \n\t"
|
||||
"paddw %%mm6, %%mm0 \n\t"
|
||||
"paddw %%mm6, %%mm1 \n\t"
|
||||
"psraw $1, %%mm0 \n\t"
|
||||
"psraw $1, %%mm1 \n\t"
|
||||
"paddw (%2, %0), %%mm0 \n\t"
|
||||
"paddw 8(%2, %0), %%mm1 \n\t"
|
||||
"psraw $6, %%mm0 \n\t"
|
||||
"psraw $6, %%mm1 \n\t"
|
||||
"pmullw (%3, %0), %%mm0 \n\t"
|
||||
"pmullw 8(%3, %0), %%mm1 \n\t"
|
||||
"pmaddwd %%mm0, %%mm0 \n\t"
|
||||
"pmaddwd %%mm1, %%mm1 \n\t"
|
||||
"paddd %%mm1, %%mm0 \n\t"
|
||||
"psrld $4, %%mm0 \n\t"
|
||||
"paddd %%mm0, %%mm7 \n\t"
|
||||
"add $16, %0 \n\t"
|
||||
"cmp $128, %0 \n\t" //FIXME optimize & bench
|
||||
" jb 1b \n\t"
|
||||
"movq %%mm7, %%mm6 \n\t"
|
||||
"psrlq $32, %%mm7 \n\t"
|
||||
"paddd %%mm6, %%mm7 \n\t"
|
||||
"psrld $2, %%mm7 \n\t"
|
||||
"movd %%mm7, %0 \n\t"
|
||||
#include "dsputil_mmx_qns.h"
|
||||
|
||||
: "+r" (i)
|
||||
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
|
||||
);
|
||||
return i;
|
||||
}
|
||||
#undef DEF
|
||||
#undef SET_RND
|
||||
#undef SCALE_OFFSET
|
||||
#undef PMULHRW
|
||||
|
||||
static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
|
||||
long i=0;
|
||||
#define DEF(x) x ## _3dnow
|
||||
#define SET_RND(x)
|
||||
#define SCALE_OFFSET 0
|
||||
#define PMULHRW(x, y, s, o)\
|
||||
"pmulhrw " #s ", "#x " \n\t"\
|
||||
"pmulhrw " #s ", "#y " \n\t"
|
||||
|
||||
if(FFABS(scale) < 256){
|
||||
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
|
||||
asm volatile(
|
||||
"pcmpeqw %%mm6, %%mm6 \n\t" // -1w
|
||||
"psrlw $15, %%mm6 \n\t" // 1w
|
||||
"movd %3, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
"1: \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
"movq 8(%1, %0), %%mm1 \n\t"
|
||||
"pmulhw %%mm5, %%mm0 \n\t"
|
||||
"pmulhw %%mm5, %%mm1 \n\t"
|
||||
"paddw %%mm6, %%mm0 \n\t"
|
||||
"paddw %%mm6, %%mm1 \n\t"
|
||||
"psraw $1, %%mm0 \n\t"
|
||||
"psraw $1, %%mm1 \n\t"
|
||||
"paddw (%2, %0), %%mm0 \n\t"
|
||||
"paddw 8(%2, %0), %%mm1 \n\t"
|
||||
"movq %%mm0, (%2, %0) \n\t"
|
||||
"movq %%mm1, 8(%2, %0) \n\t"
|
||||
"add $16, %0 \n\t"
|
||||
"cmp $128, %0 \n\t" //FIXME optimize & bench
|
||||
" jb 1b \n\t"
|
||||
#include "dsputil_mmx_qns.h"
|
||||
|
||||
#undef DEF
|
||||
#undef SET_RND
|
||||
#undef SCALE_OFFSET
|
||||
#undef PMULHRW
|
||||
|
||||
#ifdef HAVE_SSSE3
|
||||
#undef PHADDD
|
||||
#define DEF(x) x ## _ssse3
|
||||
#define SET_RND(x)
|
||||
#define SCALE_OFFSET -1
|
||||
#define PHADDD(a, t)\
|
||||
"pshufw $0x0E, "#a", "#t" \n\t"\
|
||||
"paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
|
||||
#define PMULHRW(x, y, s, o)\
|
||||
"pmulhrsw " #s ", "#x " \n\t"\
|
||||
"pmulhrsw " #s ", "#y " \n\t"
|
||||
|
||||
#include "dsputil_mmx_qns.h"
|
||||
|
||||
#undef DEF
|
||||
#undef SET_RND
|
||||
#undef SCALE_OFFSET
|
||||
#undef PMULHRW
|
||||
#undef PHADDD
|
||||
#endif //HAVE_SSSE3
|
||||
|
||||
: "+r" (i)
|
||||
: "r"(basis), "r"(rem), "g"(scale)
|
||||
);
|
||||
}else{
|
||||
for(i=0; i<8*8; i++){
|
||||
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif /* CONFIG_ENCODERS */
|
||||
|
||||
#define PREFETCH(name, op) \
|
||||
@ -3625,6 +3603,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
|
||||
#ifdef HAVE_SSSE3
|
||||
if(mm_flags & MM_SSSE3){
|
||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
||||
c->try_8x8basis= try_8x8basis_ssse3;
|
||||
}
|
||||
c->add_8x8basis= add_8x8basis_ssse3;
|
||||
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
|
||||
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
|
||||
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
|
||||
@ -3646,6 +3628,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
|
||||
#endif
|
||||
|
||||
if(mm_flags & MM_3DNOW){
|
||||
#ifdef CONFIG_ENCODERS
|
||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
|
||||
c->try_8x8basis= try_8x8basis_3dnow;
|
||||
}
|
||||
c->add_8x8basis= add_8x8basis_3dnow;
|
||||
#endif //CONFIG_ENCODERS
|
||||
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
|
||||
c->vector_fmul = vector_fmul_3dnow;
|
||||
if(!(avctx->flags & CODEC_FLAG_BITEXACT))
|
||||
|
102
libavcodec/i386/dsputil_mmx_qns.h
Normal file
102
libavcodec/i386/dsputil_mmx_qns.h
Normal file
@ -0,0 +1,102 @@
|
||||
/*
|
||||
* DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
|
||||
* Copyright (c) 2004 Michael Niedermayer
|
||||
*
|
||||
* This file is part of FFmpeg.
|
||||
*
|
||||
* FFmpeg is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* FFmpeg is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public
|
||||
* License along with FFmpeg; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
|
||||
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
|
||||
*/
|
||||
|
||||
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
|
||||
|
||||
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
|
||||
{
|
||||
long i=0;
|
||||
|
||||
assert(FFABS(scale) < MAX_ABS);
|
||||
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
|
||||
|
||||
SET_RND(mm6);
|
||||
asm volatile(
|
||||
"pxor %%mm7, %%mm7 \n\t"
|
||||
"movd %4, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
ASMALIGN(4)
|
||||
"1: \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
"movq 8(%1, %0), %%mm1 \n\t"
|
||||
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
|
||||
"paddw (%2, %0), %%mm0 \n\t"
|
||||
"paddw 8(%2, %0), %%mm1 \n\t"
|
||||
"psraw $6, %%mm0 \n\t"
|
||||
"psraw $6, %%mm1 \n\t"
|
||||
"pmullw (%3, %0), %%mm0 \n\t"
|
||||
"pmullw 8(%3, %0), %%mm1 \n\t"
|
||||
"pmaddwd %%mm0, %%mm0 \n\t"
|
||||
"pmaddwd %%mm1, %%mm1 \n\t"
|
||||
"paddd %%mm1, %%mm0 \n\t"
|
||||
"psrld $4, %%mm0 \n\t"
|
||||
"paddd %%mm0, %%mm7 \n\t"
|
||||
"add $16, %0 \n\t"
|
||||
"cmp $128, %0 \n\t" //FIXME optimize & bench
|
||||
" jb 1b \n\t"
|
||||
PHADDD(%%mm7, %%mm6)
|
||||
"psrld $2, %%mm7 \n\t"
|
||||
"movd %%mm7, %0 \n\t"
|
||||
|
||||
: "+r" (i)
|
||||
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
|
||||
);
|
||||
return i;
|
||||
}
|
||||
|
||||
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
|
||||
{
|
||||
long i=0;
|
||||
|
||||
if(FFABS(scale) < MAX_ABS){
|
||||
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
|
||||
SET_RND(mm6);
|
||||
asm volatile(
|
||||
"movd %3, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
"punpcklwd %%mm5, %%mm5 \n\t"
|
||||
ASMALIGN(4)
|
||||
"1: \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
"movq 8(%1, %0), %%mm1 \n\t"
|
||||
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
|
||||
"paddw (%2, %0), %%mm0 \n\t"
|
||||
"paddw 8(%2, %0), %%mm1 \n\t"
|
||||
"movq %%mm0, (%2, %0) \n\t"
|
||||
"movq %%mm1, 8(%2, %0) \n\t"
|
||||
"add $16, %0 \n\t"
|
||||
"cmp $128, %0 \n\t" // FIXME optimize & bench
|
||||
" jb 1b \n\t"
|
||||
|
||||
: "+r" (i)
|
||||
: "r"(basis), "r"(rem), "g"(scale)
|
||||
);
|
||||
}else{
|
||||
for(i=0; i<8*8; i++){
|
||||
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user