Do not misuse long as the size of a register on x86.
Typedef x86_reg to the appropriate register-sized type and use it instead of long. Originally committed as revision 13081 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
35027eddf3
commit
40d0e665d0
@ -25,6 +25,7 @@
|
||||
#include "dsputil.h"
|
||||
#include "dsputil_mmx.h"
|
||||
#include "common.h"
|
||||
#include "x86_cpu.h"
|
||||
|
||||
/*****************************************************************************
|
||||
*
|
||||
@ -301,7 +302,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
|
||||
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((long)srcStride), "D"((long)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
|
||||
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
|
||||
: "memory"\
|
||||
);\
|
||||
if(h==16){\
|
||||
@ -316,7 +317,7 @@ static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
|
||||
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((long)srcStride), "D"((long)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
|
||||
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ADD), "m"(MUL1), "m"(MUL2)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -367,7 +368,7 @@ static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstSt
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+m"(h)\
|
||||
: "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
|
||||
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
|
@ -42,7 +42,7 @@ int mm_support(void)
|
||||
int rval = 0;
|
||||
int eax, ebx, ecx, edx;
|
||||
int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
|
||||
long a, c;
|
||||
x86_reg a, c;
|
||||
|
||||
asm volatile (
|
||||
/* See if CPUID instruction is supported ... */
|
||||
|
@ -249,7 +249,7 @@ static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*
|
||||
"sub $2, %2 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
: "+r"(dst), "+r"(src), "+r"(h)
|
||||
: "r"((long)stride), "m"(ff_pw_32), "m"(x), "m"(y)
|
||||
: "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y)
|
||||
);
|
||||
}
|
||||
|
||||
@ -300,7 +300,7 @@ static void H264_CHROMA_MC2_TMPL(uint8_t *dst/*align 2*/, uint8_t *src/*align 1*
|
||||
"sub $1, %2\n\t"
|
||||
"jnz 1b\n\t"
|
||||
: "+r" (dst), "+r"(src), "+r"(h)
|
||||
: "m" (ff_pw_32), "r"((long)stride)
|
||||
: "m" (ff_pw_32), "r"((x86_reg)stride)
|
||||
: "%esi");
|
||||
|
||||
}
|
||||
|
@ -72,7 +72,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
|
||||
"lea (%0,%3,2), %0 \n\t"
|
||||
"jg 1b \n\t"
|
||||
:"+r"(dst), "+r"(src), "+r"(h)
|
||||
:"r"((long)stride)
|
||||
:"r"((x86_reg)stride)
|
||||
);
|
||||
} else {
|
||||
asm volatile(
|
||||
@ -100,7 +100,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
|
||||
"lea (%0,%3,2), %0 \n\t"
|
||||
"jg 1b \n\t"
|
||||
:"+r"(dst), "+r"(src), "+r"(h)
|
||||
:"r"((long)stride)
|
||||
:"r"((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
return;
|
||||
@ -154,7 +154,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
|
||||
"lea (%0,%3,2), %0 \n\t"
|
||||
"jg 1b \n\t"
|
||||
:"+r"(dst), "+r"(src), "+r"(h)
|
||||
:"r"((long)stride)
|
||||
:"r"((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
@ -202,7 +202,7 @@ static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*
|
||||
"lea (%0,%3,2), %0 \n\t"
|
||||
"jg 1b \n\t"
|
||||
:"+r"(dst), "+r"(src), "+r"(h)
|
||||
:"r"((long)stride)
|
||||
:"r"((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -240,7 +240,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
|
||||
"movq %%mm2, (%0, %1) \n\t"
|
||||
"movq %%mm4, (%0, %1, 2) \n\t"
|
||||
"movq %%mm6, (%0, %2) \n\t"
|
||||
::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
|
||||
::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
|
||||
:"memory");
|
||||
pix += line_size*4;
|
||||
p += 32;
|
||||
@ -265,7 +265,7 @@ void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size
|
||||
"movq %%mm2, (%0, %1) \n\t"
|
||||
"movq %%mm4, (%0, %1, 2) \n\t"
|
||||
"movq %%mm6, (%0, %2) \n\t"
|
||||
::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
|
||||
::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
|
||||
:"memory");
|
||||
}
|
||||
|
||||
@ -349,7 +349,7 @@ static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
: "+g"(h), "+r" (pixels), "+r" (block)
|
||||
: "r"((long)line_size)
|
||||
: "r"((x86_reg)line_size)
|
||||
: "%"REG_a, "memory"
|
||||
);
|
||||
}
|
||||
@ -375,7 +375,7 @@ static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
: "+g"(h), "+r" (pixels), "+r" (block)
|
||||
: "r"((long)line_size)
|
||||
: "r"((x86_reg)line_size)
|
||||
: "%"REG_a, "memory"
|
||||
);
|
||||
}
|
||||
@ -409,7 +409,7 @@ static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_siz
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
: "+g"(h), "+r" (pixels), "+r" (block)
|
||||
: "r"((long)line_size)
|
||||
: "r"((x86_reg)line_size)
|
||||
: "%"REG_a, "memory"
|
||||
);
|
||||
}
|
||||
@ -431,7 +431,7 @@ static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si
|
||||
"lea (%2,%3,4), %2 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
: "+g"(h), "+r" (pixels), "+r" (block)
|
||||
: "r"((long)line_size), "r"(3L*line_size)
|
||||
: "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
@ -457,7 +457,7 @@ static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_si
|
||||
"lea (%2,%3,4), %2 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
: "+g"(h), "+r" (pixels), "+r" (block)
|
||||
: "r"((long)line_size), "r"(3L*line_size)
|
||||
: "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
@ -480,7 +480,7 @@ static void clear_blocks_mmx(DCTELEM *blocks)
|
||||
}
|
||||
|
||||
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
|
||||
long i=0;
|
||||
x86_reg i=0;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movq (%1, %0), %%mm0 \n\t"
|
||||
@ -495,14 +495,14 @@ static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
|
||||
"cmp %3, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (i)
|
||||
: "r"(src), "r"(dst), "r"((long)w-15)
|
||||
: "r"(src), "r"(dst), "r"((x86_reg)w-15)
|
||||
);
|
||||
for(; i<w; i++)
|
||||
dst[i+0] += src[i+0];
|
||||
}
|
||||
|
||||
static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
|
||||
long i=0;
|
||||
x86_reg i=0;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movq (%2, %0), %%mm0 \n\t"
|
||||
@ -515,7 +515,7 @@ static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
|
||||
"cmp %4, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (i)
|
||||
: "r"(dst), "r"(src1), "r"(src2), "r"((long)w-15)
|
||||
: "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
|
||||
);
|
||||
for(; i<w; i++)
|
||||
dst[i] = src1[i] + src2[i];
|
||||
@ -689,8 +689,8 @@ static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
|
||||
"movd %%mm6, (%1,%3) \n\t"
|
||||
:: "r" (src),
|
||||
"r" (src + 4*stride),
|
||||
"r" ((long) stride ),
|
||||
"r" ((long)(3*stride))
|
||||
"r" ((x86_reg) stride ),
|
||||
"r" ((x86_reg)(3*stride))
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -723,7 +723,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
|
||||
"cmp %3, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (ptr)
|
||||
: "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
|
||||
: "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
|
||||
);
|
||||
}
|
||||
else
|
||||
@ -746,7 +746,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
|
||||
"cmp %3, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (ptr)
|
||||
: "r" ((long)wrap), "r" ((long)width), "r" (ptr + wrap*height)
|
||||
: "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
|
||||
);
|
||||
}
|
||||
|
||||
@ -764,7 +764,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
|
||||
"cmp %4, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (ptr)
|
||||
: "r" ((long)buf - (long)ptr - w), "r" ((long)-wrap), "r" ((long)-wrap*3), "r" (ptr+width+2*w)
|
||||
: "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
|
||||
);
|
||||
ptr= last_line + (i + 1) * wrap - w;
|
||||
asm volatile(
|
||||
@ -778,7 +778,7 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
|
||||
"cmp %4, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (ptr)
|
||||
: "r" ((long)last_line - (long)ptr - w), "r" ((long)wrap), "r" ((long)wrap*3), "r" (ptr+width+2*w)
|
||||
: "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -786,8 +786,8 @@ static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
|
||||
#define PAETH(cpu, abs3)\
|
||||
void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
|
||||
{\
|
||||
long i = -bpp;\
|
||||
long end = w-3;\
|
||||
x86_reg i = -bpp;\
|
||||
x86_reg end = w-3;\
|
||||
asm volatile(\
|
||||
"pxor %%mm7, %%mm7 \n"\
|
||||
"movd (%1,%0), %%mm0 \n"\
|
||||
@ -830,7 +830,7 @@ void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, in
|
||||
"cmp %5, %0 \n"\
|
||||
"jle 1b \n"\
|
||||
:"+r"(i)\
|
||||
:"r"(dst), "r"(top), "r"(src), "r"((long)bpp), "g"(end),\
|
||||
:"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
|
||||
"m"(ff_pw_255)\
|
||||
:"memory"\
|
||||
);\
|
||||
@ -994,7 +994,7 @@ static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, in
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+D"(h)\
|
||||
: "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
|
||||
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -1105,7 +1105,7 @@ static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+d"(h)\
|
||||
: "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
|
||||
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -1169,7 +1169,7 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+r" (src), "+r" (temp_ptr), "+r"(count)\
|
||||
: "r" ((long)srcStride)\
|
||||
: "r" ((x86_reg)srcStride)\
|
||||
: "memory"\
|
||||
);\
|
||||
\
|
||||
@ -1216,7 +1216,7 @@ static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
|
||||
" jnz 1b \n\t"\
|
||||
\
|
||||
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\
|
||||
: "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\
|
||||
: "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
|
||||
:"memory"\
|
||||
);\
|
||||
}\
|
||||
@ -1241,7 +1241,7 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+r" (src), "+r" (temp_ptr), "+r"(count)\
|
||||
: "r" ((long)srcStride)\
|
||||
: "r" ((x86_reg)srcStride)\
|
||||
: "memory"\
|
||||
);\
|
||||
\
|
||||
@ -1276,7 +1276,7 @@ static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src,
|
||||
" jnz 1b \n\t"\
|
||||
\
|
||||
: "+r"(temp_ptr), "+r"(dst), "+g"(count)\
|
||||
: "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\
|
||||
: "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -1839,7 +1839,7 @@ static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
|
||||
}
|
||||
|
||||
static void vector_fmul_3dnow(float *dst, const float *src, int len){
|
||||
long i = (len-4)*4;
|
||||
x86_reg i = (len-4)*4;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movq (%1,%0), %%mm0 \n\t"
|
||||
@ -1857,7 +1857,7 @@ static void vector_fmul_3dnow(float *dst, const float *src, int len){
|
||||
);
|
||||
}
|
||||
static void vector_fmul_sse(float *dst, const float *src, int len){
|
||||
long i = (len-8)*4;
|
||||
x86_reg i = (len-8)*4;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movaps (%1,%0), %%xmm0 \n\t"
|
||||
@ -1875,7 +1875,7 @@ static void vector_fmul_sse(float *dst, const float *src, int len){
|
||||
}
|
||||
|
||||
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
|
||||
long i = len*4-16;
|
||||
x86_reg i = len*4-16;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"pswapd 8(%1), %%mm0 \n\t"
|
||||
@ -1893,7 +1893,7 @@ static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const floa
|
||||
asm volatile("femms");
|
||||
}
|
||||
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
|
||||
long i = len*4-32;
|
||||
x86_reg i = len*4-32;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movaps 16(%1), %%xmm0 \n\t"
|
||||
@ -1914,7 +1914,7 @@ static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *
|
||||
|
||||
static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float *src1,
|
||||
const float *src2, int src3, int len, int step){
|
||||
long i = (len-4)*4;
|
||||
x86_reg i = (len-4)*4;
|
||||
if(step == 2 && src3 == 0){
|
||||
dst += (len-4)*2;
|
||||
asm volatile(
|
||||
@ -1963,7 +1963,7 @@ static void vector_fmul_add_add_3dnow(float *dst, const float *src0, const float
|
||||
}
|
||||
static void vector_fmul_add_add_sse(float *dst, const float *src0, const float *src1,
|
||||
const float *src2, int src3, int len, int step){
|
||||
long i = (len-8)*4;
|
||||
x86_reg i = (len-8)*4;
|
||||
if(step == 2 && src3 == 0){
|
||||
dst += (len-8)*2;
|
||||
asm volatile(
|
||||
|
@ -55,7 +55,7 @@ static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -105,7 +105,7 @@ static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
}
|
||||
|
||||
@ -152,7 +152,7 @@ static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
@ -222,7 +222,7 @@ static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
@ -277,7 +277,7 @@ static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
}
|
||||
|
||||
@ -329,7 +329,7 @@ static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
@ -373,7 +373,7 @@ static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -417,7 +417,7 @@ static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
@ -471,7 +471,7 @@ static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
@ -544,7 +544,7 @@ static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *sr
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
//the following should be used, though better not with gcc ...
|
||||
/* :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
|
||||
@ -586,7 +586,7 @@ static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, in
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -616,7 +616,7 @@ static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D" (block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -650,7 +650,7 @@ static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, in
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D" (block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -678,7 +678,7 @@ static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_siz
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -710,7 +710,7 @@ static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -748,7 +748,7 @@ static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -791,7 +791,7 @@ static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r" ((long)line_size)
|
||||
:"r" ((x86_reg)line_size)
|
||||
:"%"REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -812,7 +812,7 @@ static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_siz
|
||||
"movd %%mm2, (%1, %2, 2) \n\t"
|
||||
"movd %%mm3, (%1, %3) \n\t"
|
||||
::"S"(pixels), "D"(block),
|
||||
"r" ((long)line_size), "r"(3L*line_size)
|
||||
"r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
|
||||
:"memory");
|
||||
block += 4*line_size;
|
||||
pixels += 4*line_size;
|
||||
@ -868,8 +868,8 @@ static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride
|
||||
"decl %0 \n\t"\
|
||||
"jnz 1b \n\t"\
|
||||
:"+g"(h), "+r"(src)\
|
||||
:"r"((long)off1), "r"((long)off2),\
|
||||
"r"((long)(dst-src)), "r"((long)stride)\
|
||||
:"r"((x86_reg)off1), "r"((x86_reg)off2),\
|
||||
"r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
|
||||
:"memory"\
|
||||
);\
|
||||
}\
|
||||
@ -885,8 +885,8 @@ static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride,
|
||||
"decl %0 \n\t"\
|
||||
"jnz 1b \n\t"\
|
||||
:"+g"(h), "+r"(src)\
|
||||
:"r"((long)off1), "r"((long)off2),\
|
||||
"r"((long)(dst-src)), "r"((long)stride)\
|
||||
:"r"((x86_reg)off1), "r"((x86_reg)off2),\
|
||||
"r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
|
||||
:"memory"\
|
||||
);\
|
||||
}
|
||||
|
@ -30,7 +30,7 @@
|
||||
|
||||
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
|
||||
{
|
||||
long i=0;
|
||||
x86_reg i=0;
|
||||
|
||||
assert(FFABS(scale) < MAX_ABS);
|
||||
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
|
||||
@ -72,7 +72,7 @@ static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[
|
||||
|
||||
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
|
||||
{
|
||||
long i=0;
|
||||
x86_reg i=0;
|
||||
|
||||
if(FFABS(scale) < MAX_ABS){
|
||||
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
|
||||
|
@ -57,7 +57,7 @@ static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((long)line_size)
|
||||
:"r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -107,7 +107,7 @@ static void av_unused DEF(put, pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
}
|
||||
|
||||
@ -153,7 +153,7 @@ static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, int lin
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((long)line_size)
|
||||
:"r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -202,7 +202,7 @@ static void av_unused DEF(put, pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t
|
||||
#else
|
||||
:"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
|
||||
#endif
|
||||
:"S"((long)src1Stride), "D"((long)dstStride)
|
||||
:"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
|
||||
:"memory");
|
||||
}
|
||||
|
||||
@ -231,7 +231,7 @@ static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((long)line_size)
|
||||
:"r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -297,7 +297,7 @@ static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
|
||||
"subl $2, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels)
|
||||
:"D"(block), "r"((long)line_size)
|
||||
:"D"(block), "r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -493,7 +493,7 @@ static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line
|
||||
"subl $4, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels), "+D"(block)
|
||||
:"r"((long)line_size)
|
||||
:"r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
@ -568,7 +568,7 @@ static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int lin
|
||||
"subl $2, %0 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
:"+g"(h), "+S"(pixels)
|
||||
:"D"(block), "r"((long)line_size)
|
||||
:"D"(block), "r"((x86_reg)line_size)
|
||||
:REG_a, "memory");
|
||||
}
|
||||
|
||||
|
@ -51,7 +51,7 @@ static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
|
||||
"add $32, %%"REG_a" \n\t"
|
||||
"js 1b \n\t"
|
||||
: "+r" (pixels)
|
||||
: "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
|
||||
: "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
|
||||
: "%"REG_a
|
||||
);
|
||||
}
|
||||
@ -80,7 +80,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
|
||||
"add $16, %%"REG_a" \n\t"
|
||||
"jnz 1b \n\t"
|
||||
: "+r" (s1), "+r" (s2)
|
||||
: "r" (block+64), "r" ((long)stride)
|
||||
: "r" (block+64), "r" ((x86_reg)stride)
|
||||
: "%"REG_a
|
||||
);
|
||||
}
|
||||
@ -88,7 +88,7 @@ static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint
|
||||
static int pix_sum16_mmx(uint8_t * pix, int line_size){
|
||||
const int h=16;
|
||||
int sum;
|
||||
long index= -line_size*h;
|
||||
x86_reg index= -line_size*h;
|
||||
|
||||
asm volatile(
|
||||
"pxor %%mm7, %%mm7 \n\t"
|
||||
@ -117,7 +117,7 @@ static int pix_sum16_mmx(uint8_t * pix, int line_size){
|
||||
"movd %%mm6, %0 \n\t"
|
||||
"andl $0xFFFF, %0 \n\t"
|
||||
: "=&r" (sum), "+r" (index)
|
||||
: "r" (pix - index), "r" ((long)line_size)
|
||||
: "r" (pix - index), "r" ((x86_reg)line_size)
|
||||
);
|
||||
|
||||
return sum;
|
||||
@ -162,7 +162,7 @@ static int pix_norm1_mmx(uint8_t *pix, int line_size) {
|
||||
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
|
||||
"paddd %%mm7,%%mm1\n"
|
||||
"movd %%mm1,%1\n"
|
||||
: "+r" (pix), "=r"(tmp) : "r" ((long)line_size) : "%ecx" );
|
||||
: "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
|
||||
return tmp;
|
||||
}
|
||||
|
||||
@ -222,7 +222,7 @@ static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
|
||||
"paddd %%mm7,%%mm1\n"
|
||||
"movd %%mm1,%2\n"
|
||||
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
|
||||
: "r" ((long)line_size) , "m" (h)
|
||||
: "r" ((x86_reg)line_size) , "m" (h)
|
||||
: "%ecx");
|
||||
return tmp;
|
||||
}
|
||||
@ -282,7 +282,7 @@ static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int
|
||||
"paddd %%mm7,%%mm1\n"
|
||||
"movd %%mm1,%2\n"
|
||||
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
|
||||
: "r" ((long)line_size) , "m" (h)
|
||||
: "r" ((x86_reg)line_size) , "m" (h)
|
||||
: "%ecx");
|
||||
return tmp;
|
||||
}
|
||||
@ -345,7 +345,7 @@ static int sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
|
||||
"paddd %%xmm1,%%xmm7\n"
|
||||
"movd %%xmm7,%3\n"
|
||||
: "+r" (pix1), "+r" (pix2), "+r"(h), "=r"(tmp)
|
||||
: "r" ((long)line_size));
|
||||
: "r" ((x86_reg)line_size));
|
||||
return tmp;
|
||||
}
|
||||
|
||||
@ -469,7 +469,7 @@ static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
|
||||
"paddd %%mm6,%%mm0\n"
|
||||
"movd %%mm0,%1\n"
|
||||
: "+r" (pix1), "=r"(tmp)
|
||||
: "r" ((long)line_size) , "g" (h-2)
|
||||
: "r" ((x86_reg)line_size) , "g" (h-2)
|
||||
: "%ecx");
|
||||
return tmp;
|
||||
}
|
||||
@ -583,7 +583,7 @@ static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
|
||||
"paddd %%mm6,%%mm0\n"
|
||||
"movd %%mm0,%1\n"
|
||||
: "+r" (pix1), "=r"(tmp)
|
||||
: "r" ((long)line_size) , "g" (h-2)
|
||||
: "r" ((x86_reg)line_size) , "g" (h-2)
|
||||
: "%ecx");
|
||||
return tmp + hf_noise8_mmx(pix+8, line_size, h);
|
||||
}
|
||||
@ -665,7 +665,7 @@ static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si
|
||||
"paddw %%mm6,%%mm0\n"
|
||||
"movd %%mm0,%1\n"
|
||||
: "+r" (pix), "=r"(tmp)
|
||||
: "r" ((long)line_size) , "m" (h)
|
||||
: "r" ((x86_reg)line_size) , "m" (h)
|
||||
: "%ecx");
|
||||
return tmp & 0xFFFF;
|
||||
}
|
||||
@ -706,7 +706,7 @@ static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_s
|
||||
|
||||
"movd %%mm6,%1\n"
|
||||
: "+r" (pix), "=r"(tmp)
|
||||
: "r" ((long)line_size) , "m" (h)
|
||||
: "r" ((x86_reg)line_size) , "m" (h)
|
||||
: "%ecx");
|
||||
return tmp;
|
||||
}
|
||||
@ -785,7 +785,7 @@ static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
|
||||
"paddw %%mm6,%%mm0\n"
|
||||
"movd %%mm0,%2\n"
|
||||
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
|
||||
: "r" ((long)line_size) , "m" (h)
|
||||
: "r" ((x86_reg)line_size) , "m" (h)
|
||||
: "%ecx");
|
||||
return tmp & 0x7FFF;
|
||||
}
|
||||
@ -843,14 +843,14 @@ static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, i
|
||||
|
||||
"movd %%mm6,%2\n"
|
||||
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
|
||||
: "r" ((long)line_size) , "m" (h)
|
||||
: "r" ((x86_reg)line_size) , "m" (h)
|
||||
: "%ecx");
|
||||
return tmp;
|
||||
}
|
||||
#undef SUM
|
||||
|
||||
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
|
||||
long i=0;
|
||||
x86_reg i=0;
|
||||
asm volatile(
|
||||
"1: \n\t"
|
||||
"movq (%2, %0), %%mm0 \n\t"
|
||||
@ -865,14 +865,14 @@ static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
|
||||
"cmp %4, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (i)
|
||||
: "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
|
||||
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
|
||||
);
|
||||
for(; i<w; i++)
|
||||
dst[i+0] = src1[i+0]-src2[i+0];
|
||||
}
|
||||
|
||||
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
|
||||
long i=0;
|
||||
x86_reg i=0;
|
||||
uint8_t l, lt;
|
||||
|
||||
asm volatile(
|
||||
@ -895,7 +895,7 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
|
||||
"cmp %4, %0 \n\t"
|
||||
" jb 1b \n\t"
|
||||
: "+r" (i)
|
||||
: "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
|
||||
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
|
||||
);
|
||||
|
||||
l= *left;
|
||||
@ -930,7 +930,7 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
|
||||
DIFF_PIXELS_1(m0, mm##7, mm##0, (%1,%3,4), (%2,%3,4))\
|
||||
"mov"#m1" %0, "#mm"0 \n\t"\
|
||||
: "+m"(temp), "+r"(p1b), "+r"(p2b)\
|
||||
: "r"((long)stride), "r"((long)stride*3)\
|
||||
: "r"((x86_reg)stride), "r"((x86_reg)stride*3)\
|
||||
);\
|
||||
}
|
||||
//the "+m"(temp) is needed as gcc 2.95 sometimes fails to compile "=m"(temp)
|
||||
@ -1237,7 +1237,7 @@ DCT_SAD_FUNC(ssse3)
|
||||
|
||||
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
|
||||
int sum;
|
||||
long i=size;
|
||||
x86_reg i=size;
|
||||
asm volatile(
|
||||
"pxor %%mm4, %%mm4 \n"
|
||||
"1: \n"
|
||||
|
@ -20,6 +20,7 @@
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
#include "dsputil.h"
|
||||
#include "x86_cpu.h"
|
||||
|
||||
static const int p1m1[2] __attribute__((aligned(8))) =
|
||||
{ 0, 1 << 31 };
|
||||
@ -30,7 +31,8 @@ static const int m1p1[2] __attribute__((aligned(8))) =
|
||||
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
int ln = s->nbits;
|
||||
long i, j;
|
||||
long j;
|
||||
x86_reg i;
|
||||
long nblocks, nloops;
|
||||
FFTComplex *p, *cptr;
|
||||
|
||||
|
@ -20,6 +20,7 @@
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
#include "dsputil.h"
|
||||
#include "x86_cpu.h"
|
||||
|
||||
static const int p1m1[2] __attribute__((aligned(8))) =
|
||||
{ 0, 1 << 31 };
|
||||
@ -30,7 +31,8 @@ static const int m1p1[2] __attribute__((aligned(8))) =
|
||||
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
int ln = s->nbits;
|
||||
long i, j;
|
||||
long j;
|
||||
x86_reg i;
|
||||
long nblocks, nloops;
|
||||
FFTComplex *p, *cptr;
|
||||
|
||||
@ -124,7 +126,8 @@ void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
|
||||
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
|
||||
const FFTSample *input, FFTSample *tmp)
|
||||
{
|
||||
long k, n8, n4, n2, n;
|
||||
long n8, n4, n2, n;
|
||||
x86_reg k;
|
||||
const uint16_t *revtab = s->fft.revtab;
|
||||
const FFTSample *tcos = s->tcos;
|
||||
const FFTSample *tsin = s->tsin;
|
||||
|
@ -19,6 +19,7 @@
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
#include "dsputil.h"
|
||||
#include "x86_cpu.h"
|
||||
|
||||
static const int p1p1p1m1[4] __attribute__((aligned(16))) =
|
||||
{ 0, 0, 0, 1 << 31 };
|
||||
@ -48,7 +49,8 @@ static void print_v4sf(const char *str, __m128 a)
|
||||
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
|
||||
{
|
||||
int ln = s->nbits;
|
||||
long i, j;
|
||||
x86_reg i;
|
||||
long j;
|
||||
long nblocks, nloops;
|
||||
FFTComplex *p, *cptr;
|
||||
|
||||
@ -142,7 +144,8 @@ void ff_fft_calc_sse(FFTContext *s, FFTComplex *z)
|
||||
void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output,
|
||||
const FFTSample *input, FFTSample *tmp)
|
||||
{
|
||||
long k, n8, n4, n2, n;
|
||||
x86_reg k;
|
||||
long n8, n4, n2, n;
|
||||
const uint16_t *revtab = s->fft.revtab;
|
||||
const FFTSample *tcos = s->tcos;
|
||||
const FFTSample *tsin = s->tsin;
|
||||
|
@ -20,13 +20,14 @@
|
||||
*/
|
||||
|
||||
#include "dsputil_mmx.h"
|
||||
#include "x86_cpu.h"
|
||||
|
||||
static void apply_welch_window_sse2(const int32_t *data, int len, double *w_data)
|
||||
{
|
||||
double c = 2.0 / (len-1.0);
|
||||
int n2 = len>>1;
|
||||
long i = -n2*sizeof(int32_t);
|
||||
long j = n2*sizeof(int32_t);
|
||||
x86_reg i = -n2*sizeof(int32_t);
|
||||
x86_reg j = n2*sizeof(int32_t);
|
||||
asm volatile(
|
||||
"movsd %0, %%xmm7 \n\t"
|
||||
"movapd %1, %%xmm6 \n\t"
|
||||
@ -71,7 +72,7 @@ void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
|
||||
double *data1 = tmp + lag;
|
||||
int j;
|
||||
|
||||
if((long)data1 & 15)
|
||||
if((x86_reg)data1 & 15)
|
||||
data1++;
|
||||
|
||||
apply_welch_window_sse2(data, len, data1);
|
||||
@ -81,7 +82,7 @@ void ff_flac_compute_autocorr_sse2(const int32_t *data, int len, int lag,
|
||||
data1[len] = 0.0;
|
||||
|
||||
for(j=0; j<lag; j+=2){
|
||||
long i = -len*sizeof(double);
|
||||
x86_reg i = -len*sizeof(double);
|
||||
if(j == lag-2) {
|
||||
asm volatile(
|
||||
"movsd %6, %%xmm0 \n\t"
|
||||
|
@ -96,7 +96,7 @@ static int decode_significance_8x8_x86(CABACContext *c,
|
||||
int *index, const uint8_t *sig_off){
|
||||
int minusindex= 4-(int)index;
|
||||
int coeff_count;
|
||||
long last=0;
|
||||
x86_reg last=0;
|
||||
asm volatile(
|
||||
"movl "RANGE "(%3), %%esi \n\t"
|
||||
"movl "LOW "(%3), %%ebx \n\t"
|
||||
|
@ -86,7 +86,7 @@ static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
|
||||
"add %1, %0 \n\t"
|
||||
STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
|
||||
: "+r"(dst)
|
||||
: "r" ((long)stride)
|
||||
: "r" ((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
@ -294,7 +294,7 @@ static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
|
||||
STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
|
||||
STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
|
||||
:"+r"(dst)
|
||||
:"r"(block), "r"((long)stride), "r"(3L*stride), "m"(ff_pw_32)
|
||||
:"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
|
||||
);
|
||||
}
|
||||
|
||||
@ -503,7 +503,7 @@ static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alph
|
||||
"movq %%mm2, (%2) \n\t"
|
||||
|
||||
: "=m"(*tmp0)
|
||||
: "r"(pix-3*stride), "r"(pix), "r"((long)stride),
|
||||
: "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
|
||||
"m"(*tmp0/*unused*/), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
|
||||
"m"(ff_bone)
|
||||
);
|
||||
@ -550,7 +550,7 @@ static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int al
|
||||
"movq %%mm1, (%0,%2) \n\t"
|
||||
"movq %%mm2, (%1) \n\t"
|
||||
|
||||
:: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
|
||||
:: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
|
||||
"r"(*(uint32_t*)tc0),
|
||||
"m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
|
||||
);
|
||||
@ -601,7 +601,7 @@ static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride,
|
||||
"paddb %%mm6, %%mm2 \n\t"
|
||||
"movq %%mm1, (%0,%2) \n\t"
|
||||
"movq %%mm2, (%1) \n\t"
|
||||
:: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
|
||||
:: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
|
||||
"m"(alpha1), "m"(beta1), "m"(ff_bone)
|
||||
);
|
||||
}
|
||||
@ -797,7 +797,7 @@ static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uin
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+g"(h)\
|
||||
: "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -840,7 +840,7 @@ static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst,
|
||||
"add %4, %1 \n\t"\
|
||||
"add %3, %2 \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+d"(src2)\
|
||||
: "D"((long)src2Stride), "S"((long)dstStride)\
|
||||
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
|
||||
: "memory"\
|
||||
);\
|
||||
}while(--h);\
|
||||
@ -870,7 +870,7 @@ static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uin
|
||||
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -902,7 +902,7 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
|
||||
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
|
||||
\
|
||||
: "+a"(src)\
|
||||
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
tmp += 4;\
|
||||
@ -931,7 +931,7 @@ static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, in
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(tmp), "+c"(dst), "+g"(h)\
|
||||
: "S"((long)dstStride)\
|
||||
: "S"((x86_reg)dstStride)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -988,7 +988,7 @@ static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uin
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+g"(h)\
|
||||
: "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -1049,7 +1049,7 @@ static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst,
|
||||
"add %4, %1 \n\t"\
|
||||
"add %3, %2 \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+d"(src2)\
|
||||
: "D"((long)src2Stride), "S"((long)dstStride),\
|
||||
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
|
||||
"m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
@ -1088,7 +1088,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst,
|
||||
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
if(h==16){\
|
||||
@ -1103,7 +1103,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst,
|
||||
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -1141,7 +1141,7 @@ static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_
|
||||
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
|
||||
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
|
||||
: "+a"(src)\
|
||||
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
if(size==16){\
|
||||
@ -1155,7 +1155,7 @@ static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_
|
||||
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
|
||||
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
|
||||
: "+a"(src)\
|
||||
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -1202,7 +1202,7 @@ static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(tmp), "+c"(dst), "+g"(h)\
|
||||
: "S"((long)dstStride)\
|
||||
: "S"((x86_reg)dstStride)\
|
||||
: "memory"\
|
||||
);\
|
||||
tmp += 8 - size*24;\
|
||||
@ -1275,7 +1275,7 @@ static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_
|
||||
OP(%%mm0, (%2), %%mm4, d)\
|
||||
OP(%%mm1, (%2,%4), %%mm5, d)\
|
||||
:"+a"(src8), "+c"(src16), "+d"(dst)\
|
||||
:"S"((long)src8Stride), "D"((long)dstStride)\
|
||||
:"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
|
||||
:"memory");\
|
||||
}\
|
||||
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
|
||||
@ -1297,7 +1297,7 @@ static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_
|
||||
OP(%%mm0, (%2), %%mm5, q)\
|
||||
OP(%%mm2, (%2,%4), %%mm5, q)\
|
||||
::"a"(src8), "c"(src16), "d"(dst),\
|
||||
"r"((long)src8Stride), "r"((long)dstStride)\
|
||||
"r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
|
||||
:"memory");\
|
||||
src8 += 2L*src8Stride;\
|
||||
src16 += 48;\
|
||||
@ -1372,7 +1372,7 @@ static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst,
|
||||
"decl %3 \n\t"\
|
||||
"jg 1b \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
|
||||
: "D"((long)src2Stride), "S"((long)dstStride),\
|
||||
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
|
||||
"m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
@ -1430,7 +1430,7 @@ static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst,
|
||||
"add %4, %1 \n\t"\
|
||||
"add %3, %2 \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+d"(src2)\
|
||||
: "D"((long)src2Stride), "S"((long)dstStride),\
|
||||
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
|
||||
"m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
@ -1473,7 +1473,7 @@ static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uin
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(src), "+c"(dst), "+g"(h)\
|
||||
: "D"((long)srcStride), "S"((long)dstStride),\
|
||||
: "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
|
||||
"m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
@ -1518,7 +1518,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst,
|
||||
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
if(h==16){\
|
||||
@ -1533,7 +1533,7 @@ static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst,
|
||||
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
|
||||
\
|
||||
: "+a"(src), "+c"(dst)\
|
||||
: "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
@ -1576,7 +1576,7 @@ static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, u
|
||||
QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
|
||||
QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
|
||||
: "+a"(src)
|
||||
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
|
||||
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
|
||||
: "memory"
|
||||
);
|
||||
if(size==16){
|
||||
@ -1590,7 +1590,7 @@ static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, u
|
||||
QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
|
||||
QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
|
||||
: "+a"(src)
|
||||
: "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
|
||||
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
@ -1654,7 +1654,7 @@ static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(tmp), "+c"(dst), "+g"(h)\
|
||||
: "S"((long)dstStride)\
|
||||
: "S"((x86_reg)dstStride)\
|
||||
: "memory"\
|
||||
);\
|
||||
}else{\
|
||||
@ -1688,7 +1688,7 @@ static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_
|
||||
"decl %2 \n\t"\
|
||||
" jnz 1b \n\t"\
|
||||
: "+a"(tmp), "+c"(dst), "+g"(h)\
|
||||
: "S"((long)dstStride)\
|
||||
: "S"((x86_reg)dstStride)\
|
||||
: "memory"\
|
||||
);\
|
||||
}\
|
||||
|
@ -34,7 +34,7 @@ DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
|
||||
|
||||
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
|
||||
{
|
||||
long len= -(stride*h);
|
||||
x86_reg len= -(stride*h);
|
||||
asm volatile(
|
||||
ASMALIGN(4)
|
||||
"1: \n\t"
|
||||
@ -64,7 +64,7 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
|
||||
"add %3, %%"REG_a" \n\t"
|
||||
" js 1b \n\t"
|
||||
: "+a" (len)
|
||||
: "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
|
||||
: "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
@ -84,7 +84,7 @@ static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
|
||||
"sub $2, %0 \n\t"
|
||||
" jg 1b \n\t"
|
||||
: "+r" (h), "+r" (blk1), "+r" (blk2)
|
||||
: "r" ((long)stride)
|
||||
: "r" ((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
@ -106,7 +106,7 @@ static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
|
||||
"sub $2, %0 \n\t"
|
||||
" jg 1b \n\t"
|
||||
: "+r" (h), "+r" (blk1), "+r" (blk2)
|
||||
: "r" ((long)stride)
|
||||
: "r" ((x86_reg)stride)
|
||||
);
|
||||
asm volatile(
|
||||
"movhlps %%xmm6, %%xmm0 \n\t"
|
||||
@ -135,7 +135,7 @@ static inline void sad8_x2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h
|
||||
"sub $2, %0 \n\t"
|
||||
" jg 1b \n\t"
|
||||
: "+r" (h), "+r" (blk1), "+r" (blk2)
|
||||
: "r" ((long)stride)
|
||||
: "r" ((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
@ -160,7 +160,7 @@ static inline void sad8_y2a_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h
|
||||
"sub $2, %0 \n\t"
|
||||
" jg 1b \n\t"
|
||||
: "+r" (h), "+r" (blk1), "+r" (blk2)
|
||||
: "r" ((long)stride)
|
||||
: "r" ((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
@ -190,13 +190,13 @@ static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
|
||||
"sub $2, %0 \n\t"
|
||||
" jg 1b \n\t"
|
||||
: "+r" (h), "+r" (blk1), "+r" (blk2)
|
||||
: "r" ((long)stride)
|
||||
: "r" ((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
|
||||
{
|
||||
long len= -(stride*h);
|
||||
x86_reg len= -(stride*h);
|
||||
asm volatile(
|
||||
ASMALIGN(4)
|
||||
"1: \n\t"
|
||||
@ -228,13 +228,13 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int
|
||||
"add %4, %%"REG_a" \n\t"
|
||||
" js 1b \n\t"
|
||||
: "+a" (len)
|
||||
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
|
||||
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
|
||||
{
|
||||
long len= -(stride*h);
|
||||
x86_reg len= -(stride*h);
|
||||
asm volatile(
|
||||
"movq (%1, %%"REG_a"), %%mm0 \n\t"
|
||||
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
|
||||
@ -281,7 +281,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
|
||||
"add %4, %%"REG_a" \n\t"
|
||||
" js 1b \n\t"
|
||||
: "+a" (len)
|
||||
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
|
||||
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -34,7 +34,7 @@ extern uint16_t inv_zigzag_direct16[64];
|
||||
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
|
||||
DCTELEM *block, int n, int qscale)
|
||||
{
|
||||
long level, qmul, qadd, nCoeffs;
|
||||
x86_reg level, qmul, qadd, nCoeffs;
|
||||
|
||||
qmul = qscale << 1;
|
||||
|
||||
@ -109,7 +109,7 @@ asm volatile(
|
||||
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
|
||||
DCTELEM *block, int n, int qscale)
|
||||
{
|
||||
long qmul, qadd, nCoeffs;
|
||||
x86_reg qmul, qadd, nCoeffs;
|
||||
|
||||
qmul = qscale << 1;
|
||||
qadd = (qscale - 1) | 1;
|
||||
@ -200,7 +200,7 @@ asm volatile(
|
||||
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
|
||||
DCTELEM *block, int n, int qscale)
|
||||
{
|
||||
long nCoeffs;
|
||||
x86_reg nCoeffs;
|
||||
const uint16_t *quant_matrix;
|
||||
int block0;
|
||||
|
||||
@ -269,7 +269,7 @@ asm volatile(
|
||||
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
|
||||
DCTELEM *block, int n, int qscale)
|
||||
{
|
||||
long nCoeffs;
|
||||
x86_reg nCoeffs;
|
||||
const uint16_t *quant_matrix;
|
||||
|
||||
assert(s->block_last_index[n]>=0);
|
||||
@ -335,7 +335,7 @@ asm volatile(
|
||||
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
|
||||
DCTELEM *block, int n, int qscale)
|
||||
{
|
||||
long nCoeffs;
|
||||
x86_reg nCoeffs;
|
||||
const uint16_t *quant_matrix;
|
||||
int block0;
|
||||
|
||||
@ -401,7 +401,7 @@ asm volatile(
|
||||
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
|
||||
DCTELEM *block, int n, int qscale)
|
||||
{
|
||||
long nCoeffs;
|
||||
x86_reg nCoeffs;
|
||||
const uint16_t *quant_matrix;
|
||||
|
||||
assert(s->block_last_index[n]>=0);
|
||||
|
@ -95,7 +95,7 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
|
||||
DCTELEM *block, int n,
|
||||
int qscale, int *overflow)
|
||||
{
|
||||
long last_non_zero_p1;
|
||||
x86_reg last_non_zero_p1;
|
||||
int level=0, q; //=0 is because gcc says uninitialized ...
|
||||
const uint16_t *qmat, *bias;
|
||||
DECLARE_ALIGNED_16(int16_t, temp_block[64]);
|
||||
|
@ -73,7 +73,7 @@ void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){
|
||||
IDWTELEM * const dst = b+w2;
|
||||
|
||||
i = 0;
|
||||
for(; (((long)&dst[i]) & 0x1F) && i<w_r; i++){
|
||||
for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
|
||||
dst[i] = dst[i] - (b[i] + b[i + 1]);
|
||||
}
|
||||
for(; i<w_r-15; i+=16){
|
||||
@ -146,7 +146,7 @@ void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, int width){
|
||||
IDWTELEM * const src = b+w2;
|
||||
|
||||
i = 0;
|
||||
for(; (((long)&temp[i]) & 0x1F) && i<w_r; i++){
|
||||
for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
|
||||
temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
|
||||
}
|
||||
for(; i<w_r-7; i+=8){
|
||||
@ -436,7 +436,7 @@ void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, int width){
|
||||
"movdqa %%"s3", %%"t3" \n\t"
|
||||
|
||||
void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
|
||||
long i = width;
|
||||
x86_reg i = width;
|
||||
|
||||
while(i & 0x1F)
|
||||
{
|
||||
@ -534,7 +534,7 @@ void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
|
||||
|
||||
|
||||
void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
|
||||
long i = width;
|
||||
x86_reg i = width;
|
||||
while(i & 15)
|
||||
{
|
||||
i--;
|
||||
@ -605,7 +605,7 @@ void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, I
|
||||
|
||||
#define snow_inner_add_yblock_sse2_header \
|
||||
IDWTELEM * * dst_array = sb->line + src_y;\
|
||||
long tmp;\
|
||||
x86_reg tmp;\
|
||||
asm volatile(\
|
||||
"mov %7, %%"REG_c" \n\t"\
|
||||
"mov %6, %2 \n\t"\
|
||||
@ -667,7 +667,7 @@ void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, I
|
||||
"jnz 1b \n\t"\
|
||||
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
|
||||
:\
|
||||
"rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
|
||||
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"((x86_reg)b_h),"m"((x86_reg)src_stride):\
|
||||
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
|
||||
|
||||
#define snow_inner_add_yblock_sse2_end_8\
|
||||
@ -684,8 +684,8 @@ void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, I
|
||||
"dec %2 \n\t"\
|
||||
snow_inner_add_yblock_sse2_end_common2
|
||||
|
||||
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
|
||||
int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
|
||||
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
|
||||
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
|
||||
snow_inner_add_yblock_sse2_header
|
||||
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
|
||||
snow_inner_add_yblock_sse2_accum_8("2", "8")
|
||||
@ -732,8 +732,8 @@ snow_inner_add_yblock_sse2_accum_8("0", "136")
|
||||
snow_inner_add_yblock_sse2_end_8
|
||||
}
|
||||
|
||||
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
|
||||
int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
|
||||
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
|
||||
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
|
||||
snow_inner_add_yblock_sse2_header
|
||||
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
|
||||
snow_inner_add_yblock_sse2_accum_16("2", "16")
|
||||
@ -758,7 +758,7 @@ snow_inner_add_yblock_sse2_end_16
|
||||
|
||||
#define snow_inner_add_yblock_mmx_header \
|
||||
IDWTELEM * * dst_array = sb->line + src_y;\
|
||||
long tmp;\
|
||||
x86_reg tmp;\
|
||||
asm volatile(\
|
||||
"mov %7, %%"REG_c" \n\t"\
|
||||
"mov %6, %2 \n\t"\
|
||||
@ -815,11 +815,11 @@ snow_inner_add_yblock_sse2_end_16
|
||||
"jnz 1b \n\t"\
|
||||
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
|
||||
:\
|
||||
"rm"((long)(src_x<<1)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\
|
||||
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"((x86_reg)b_h),"m"((x86_reg)src_stride):\
|
||||
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
|
||||
|
||||
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
|
||||
int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
|
||||
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
|
||||
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
|
||||
snow_inner_add_yblock_mmx_header
|
||||
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
|
||||
snow_inner_add_yblock_mmx_accum("2", "8", "0")
|
||||
@ -829,8 +829,8 @@ snow_inner_add_yblock_mmx_mix("0", "0")
|
||||
snow_inner_add_yblock_mmx_end("16")
|
||||
}
|
||||
|
||||
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h,
|
||||
int src_x, int src_y, long src_stride, slice_buffer * sb, int add, uint8_t * dst8){
|
||||
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
|
||||
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
|
||||
snow_inner_add_yblock_mmx_header
|
||||
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
|
||||
snow_inner_add_yblock_mmx_accum("2", "16", "0")
|
||||
|
@ -71,7 +71,7 @@ DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL;
|
||||
|
||||
/** Sacrificing mm6 allows loads from src to be pipelined */
|
||||
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
|
||||
const uint8_t *src, long int stride,
|
||||
const uint8_t *src, x86_reg stride,
|
||||
int rnd, int64_t shift)
|
||||
{
|
||||
asm volatile(
|
||||
@ -107,7 +107,7 @@ static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
|
||||
* Data is already unpacked, so some operations can directly be made from
|
||||
* memory.
|
||||
*/
|
||||
static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, long int stride,
|
||||
static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
|
||||
const int16_t *src, int rnd)
|
||||
{
|
||||
int h = 8;
|
||||
@ -152,7 +152,7 @@ static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, long int stride,
|
||||
* Sacrifice mm6 for *9 factor.
|
||||
*/
|
||||
static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
|
||||
long int stride, int rnd, long int offset)
|
||||
x86_reg stride, int rnd, x86_reg offset)
|
||||
{
|
||||
rnd = 8-rnd;
|
||||
asm volatile(
|
||||
@ -259,7 +259,7 @@ DECLARE_ALIGNED_16(const uint64_t, ff_pw_18) = 0x0012001200120012ULL;
|
||||
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
|
||||
static void \
|
||||
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
|
||||
long int src_stride, \
|
||||
x86_reg src_stride, \
|
||||
int rnd, int64_t shift) \
|
||||
{ \
|
||||
int h = 8; \
|
||||
@ -314,7 +314,7 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
|
||||
*/
|
||||
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4) \
|
||||
static void \
|
||||
vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, long int stride, \
|
||||
vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
|
||||
const int16_t *src, int rnd) \
|
||||
{ \
|
||||
int h = 8; \
|
||||
@ -353,7 +353,7 @@ vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, long int stride, \
|
||||
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4) \
|
||||
static void \
|
||||
vc1_put_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
|
||||
long int stride, int rnd, long int offset) \
|
||||
x86_reg stride, int rnd, x86_reg offset) \
|
||||
{ \
|
||||
int h = 8; \
|
||||
src -= offset; \
|
||||
@ -387,9 +387,9 @@ MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%
|
||||
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
|
||||
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)")
|
||||
|
||||
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, long int src_stride, int rnd, int64_t shift);
|
||||
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, long int dst_stride, const int16_t *src, int rnd);
|
||||
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, long int stride, int rnd, long int offset);
|
||||
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
|
||||
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
|
||||
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
|
||||
|
||||
/**
|
||||
* Interpolates fractional pel values by applying proper vertical then
|
||||
|
@ -31,6 +31,7 @@
|
||||
# define REG_D "rdi"
|
||||
# define REG_S "rsi"
|
||||
# define PTR_SIZE "8"
|
||||
typedef int64_t x86_reg;
|
||||
|
||||
# define REG_SP "rsp"
|
||||
# define REG_BP "rbp"
|
||||
@ -50,6 +51,7 @@
|
||||
# define REG_D "edi"
|
||||
# define REG_S "esi"
|
||||
# define PTR_SIZE "4"
|
||||
typedef int32_t x86_reg;
|
||||
|
||||
# define REG_SP "esp"
|
||||
# define REG_BP "ebp"
|
||||
|
Loading…
Reference in New Issue
Block a user