3DNow! and SSSE3 optimization to QNS DSP functions; use pmulhrw/pmulhrsw instead of pmulhw

Originally committed as revision 9053 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Zuxy Meng 2007-05-18 08:18:56 +00:00
parent 2c32b173d9
commit 038bfcf9d6
2 changed files with 169 additions and 79 deletions

View File

@ -2752,91 +2752,69 @@ static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int o
}
#ifdef CONFIG_ENCODERS
static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
long i=0;
assert(FFABS(scale) < 256);
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
#define PHADDD(a, t)\
"movq "#a", "#t" \n\t"\
"psrlq $32, "#a" \n\t"\
"paddd "#t", "#a" \n\t"
/*
pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
#define PMULHRW(x, y, s, o)\
"pmulhw " #s ", "#x " \n\t"\
"pmulhw " #s ", "#y " \n\t"\
"paddw " #o ", "#x " \n\t"\
"paddw " #o ", "#y " \n\t"\
"psraw $1, "#x " \n\t"\
"psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1
asm volatile(
"pcmpeqw %%mm6, %%mm6 \n\t" // -1w
"psrlw $15, %%mm6 \n\t" // 1w
"pxor %%mm7, %%mm7 \n\t"
"movd %4, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
"pmulhw %%mm5, %%mm0 \n\t"
"pmulhw %%mm5, %%mm1 \n\t"
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm6, %%mm1 \n\t"
"psraw $1, %%mm0 \n\t"
"psraw $1, %%mm1 \n\t"
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"psraw $6, %%mm0 \n\t"
"psraw $6, %%mm1 \n\t"
"pmullw (%3, %0), %%mm0 \n\t"
"pmullw 8(%3, %0), %%mm1 \n\t"
"pmaddwd %%mm0, %%mm0 \n\t"
"pmaddwd %%mm1, %%mm1 \n\t"
"paddd %%mm1, %%mm0 \n\t"
"psrld $4, %%mm0 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" //FIXME optimize & bench
" jb 1b \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"psrld $2, %%mm7 \n\t"
"movd %%mm7, %0 \n\t"
#include "dsputil_mmx_qns.h"
: "+r" (i)
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
);
return i;
}
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
long i=0;
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
"pmulhrw " #s ", "#x " \n\t"\
"pmulhrw " #s ", "#y " \n\t"
if(FFABS(scale) < 256){
scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
asm volatile(
"pcmpeqw %%mm6, %%mm6 \n\t" // -1w
"psrlw $15, %%mm6 \n\t" // 1w
"movd %3, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
"pmulhw %%mm5, %%mm0 \n\t"
"pmulhw %%mm5, %%mm1 \n\t"
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm6, %%mm1 \n\t"
"psraw $1, %%mm0 \n\t"
"psraw $1, %%mm1 \n\t"
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"movq %%mm0, (%2, %0) \n\t"
"movq %%mm1, 8(%2, %0) \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" //FIXME optimize & bench
" jb 1b \n\t"
#include "dsputil_mmx_qns.h"
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#ifdef HAVE_SSSE3
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
"pshufw $0x0E, "#a", "#t" \n\t"\
"paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
"pmulhrsw " #s ", "#x " \n\t"\
"pmulhrsw " #s ", "#y " \n\t"
#include "dsputil_mmx_qns.h"
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif //HAVE_SSSE3
: "+r" (i)
: "r"(basis), "r"(rem), "g"(scale)
);
}else{
for(i=0; i<8*8; i++){
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
}
}
}
#endif /* CONFIG_ENCODERS */
#define PREFETCH(name, op) \
@ -3625,6 +3603,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#ifdef HAVE_SSSE3
if(mm_flags & MM_SSSE3){
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->try_8x8basis= try_8x8basis_ssse3;
}
c->add_8x8basis= add_8x8basis_ssse3;
c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
c->hadamard8_diff[0]= hadamard8_diff16_ssse3;
c->hadamard8_diff[1]= hadamard8_diff_ssse3;
@ -3646,6 +3628,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
#endif
if(mm_flags & MM_3DNOW){
#ifdef CONFIG_ENCODERS
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->try_8x8basis= try_8x8basis_3dnow;
}
c->add_8x8basis= add_8x8basis_3dnow;
#endif //CONFIG_ENCODERS
c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
c->vector_fmul = vector_fmul_3dnow;
if(!(avctx->flags & CODEC_FLAG_BITEXACT))

View File

@ -0,0 +1,102 @@
/*
* DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
* Copyright (c) 2004 Michael Niedermayer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
*/
/*
 * Exclusive upper bound on |scale| accepted by the SIMD code paths below.
 * It is 512 shifted right by SCALE_OFFSET when SCALE_OFFSET is positive and
 * plain 512 otherwise (zero or negative SCALE_OFFSET).
 * SCALE_OFFSET must be defined by the including file before this header is
 * included. The macro is not #undef'd here; every inclusion defines it with
 * the same token sequence.
 */
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
/**
 * Template for the SIMD try_8x8basis variants.
 *
 * The including file supplies DEF() (per-CPU function name), SET_RND,
 * SCALE_OFFSET, PMULHRW and PHADDD before including this header.
 *
 * For every coefficient k in 0..63 the loop computes, in 16-bit lanes,
 *     t = (rem[k] + PMULHRW(basis[k], scale)) >> 6
 *     p = low 16 bits of t * weight[k]
 * then squares p and adds adjacent squares (pmaddwd), shifts each 32-bit
 * pair sum right by 4 and accumulates it in mm7. After the loop PHADDD
 * is applied to the accumulator (presumably folding its two 32-bit halves
 * together; the macro is defined by the including file), the result is
 * shifted right by 2 and returned.
 *
 * @param rem    64 residual coefficients (read only here)
 * @param weight 64 per-coefficient weights (read only here)
 * @param basis  64 basis-function coefficients (read only here)
 * @param scale  multiplier applied to basis; must satisfy
 *               FFABS(scale) < MAX_ABS (only checked by assert)
 * @return the accumulated weighted squared sum described above
 */
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
{
    long i=0;

    assert(FFABS(scale) < MAX_ABS);
    /* Multiply instead of left-shifting: scale may be negative and shifting
     * a negative int left is undefined behaviour. The shift count itself is
     * assumed to be non-negative, as the original code already required. */
    scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);

    /* Loads the rounding constant (if the variant needs one) into mm6. */
    SET_RND(mm6);
    asm volatile(
        "pxor %%mm7, %%mm7              \n\t" /* mm7 = running sum */
        "movd  %4, %%mm5                \n\t" /* broadcast low word of scale */
        "punpcklwd %%mm5, %%mm5         \n\t"
        "punpcklwd %%mm5, %%mm5         \n\t"
        ASMALIGN(4)
        "1:                             \n\t"
        "movq  (%1, %0), %%mm0          \n\t" /* 8 basis coefficients */
        "movq  8(%1, %0), %%mm1         \n\t"
        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
        "paddw (%2, %0), %%mm0          \n\t" /* + rem */
        "paddw 8(%2, %0), %%mm1         \n\t"
        "psraw $6, %%mm0                \n\t"
        "psraw $6, %%mm1                \n\t"
        "pmullw (%3, %0), %%mm0         \n\t" /* * weight (low 16 bits) */
        "pmullw 8(%3, %0), %%mm1        \n\t"
        "pmaddwd %%mm0, %%mm0           \n\t" /* square and pair-sum */
        "pmaddwd %%mm1, %%mm1           \n\t"
        "paddd %%mm1, %%mm0             \n\t"
        "psrld $4, %%mm0                \n\t"
        "paddd %%mm0, %%mm7             \n\t"
        "add $16, %0                    \n\t"
        "cmp $128, %0                   \n\t" //FIXME optimize & bench
        " jb 1b                         \n\t"
        PHADDD(%%mm7, %%mm6)
        "psrld $2, %%mm7                \n\t"
        "movd %%mm7, %0                 \n\t"

        : "+r" (i)
        /* "rm" rather than "g": movd cannot take an immediate source. */
        : "r"(basis), "r"(rem), "r"(weight), "rm"(scale)
        /* The asm dereferences basis/rem/weight, which are passed only as
         * addresses, so the compiler must treat memory as accessed. */
        : "memory"
    );
    return i;
}
/**
 * Template for the SIMD add_8x8basis variants.
 *
 * The including file supplies DEF() (per-CPU function name), SET_RND,
 * SCALE_OFFSET and PMULHRW before including this header.
 *
 * When FFABS(scale) < MAX_ABS the SIMD loop updates rem in place:
 *     rem[k] += PMULHRW(basis[k], scale)      for k in 0..63
 * Otherwise a scalar loop performs the update with an explicit rounding
 * term and a right shift by (BASIS_SHIFT - RECON_SHIFT).
 *
 * @param rem   64 residual coefficients, modified in place
 * @param basis 64 basis-function coefficients (read only here)
 * @param scale multiplier applied to basis
 */
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
{
    long i=0;

    if(FFABS(scale) < MAX_ABS){
        /* Multiply instead of left-shifting: scale may be negative and
         * shifting a negative int left is undefined behaviour. The shift
         * count is assumed non-negative, as the original already required. */
        scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);

        /* Loads the rounding constant (if the variant needs one) into mm6. */
        SET_RND(mm6);
        asm volatile(
            "movd  %3, %%mm5            \n\t" /* broadcast low word of scale */
            "punpcklwd %%mm5, %%mm5     \n\t"
            "punpcklwd %%mm5, %%mm5     \n\t"
            ASMALIGN(4)
            "1:                         \n\t"
            "movq  (%1, %0), %%mm0      \n\t" /* 8 basis coefficients */
            "movq  8(%1, %0), %%mm1     \n\t"
            PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
            "paddw (%2, %0), %%mm0      \n\t" /* + rem */
            "paddw 8(%2, %0), %%mm1     \n\t"
            "movq %%mm0, (%2, %0)       \n\t" /* store back into rem */
            "movq %%mm1, 8(%2, %0)      \n\t"
            "add $16, %0                \n\t"
            "cmp $128, %0               \n\t" // FIXME optimize & bench
            " jb 1b                     \n\t"

            : "+r" (i)
            /* "rm" rather than "g": movd cannot take an immediate source. */
            : "r"(basis), "r"(rem), "rm"(scale)
            /* The asm reads basis[] and writes rem[] through pointers that
             * are passed only as addresses; without this clobber the
             * compiler may assume rem[] is unchanged across the statement. */
            : "memory"
        );
    }else{
        /* scale is out of range for the 16-bit SIMD multiply; do it in C. */
        for(i=0; i<8*8; i++){
            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
        }
    }
}