Fix issue #301:
Summary of changes:
- Use MANGLE when loading some constants into MMX registers.
- Convert those constants to non-static and thus add ff_ prefix.
- Remove last parameter of MSPEL_FILTER13_CORE (was constant).
- Use of "+r" instead of stricter but unnecessary "+g".
- Use of REG_c and direct loading of some of the above.

patch by Christophe GISQUET, christophe.gisquet free fr
Subject: [FFmpeg-devel] [PATCH] Roundup issue #301
Date: Fri, 28 Dec 2007 19:22:18 +0100

Originally committed as revision 11376 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
8f8fae80b2
commit
ae904fd028
@ -55,34 +55,33 @@
|
|||||||
|
|
||||||
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
|
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
|
||||||
"paddw %%mm"#R2", %%mm"#R1" \n\t" \
|
"paddw %%mm"#R2", %%mm"#R1" \n\t" \
|
||||||
"movd (%1,%4), %%mm"#R0" \n\t" \
|
"movd (%0,%3), %%mm"#R0" \n\t" \
|
||||||
"pmullw %%mm6, %%mm"#R1" \n\t" \
|
"pmullw %%mm6, %%mm"#R1" \n\t" \
|
||||||
"punpcklbw %%mm0, %%mm"#R0" \n\t" \
|
"punpcklbw %%mm0, %%mm"#R0" \n\t" \
|
||||||
"movd (%1,%3), %%mm"#R3" \n\t" \
|
"movd (%0,%2), %%mm"#R3" \n\t" \
|
||||||
"psubw %%mm"#R0", %%mm"#R1" \n\t" \
|
"psubw %%mm"#R0", %%mm"#R1" \n\t" \
|
||||||
"punpcklbw %%mm0, %%mm"#R3" \n\t" \
|
"punpcklbw %%mm0, %%mm"#R3" \n\t" \
|
||||||
"paddw %%mm7, %%mm"#R1" \n\t" \
|
"paddw %%mm7, %%mm"#R1" \n\t" \
|
||||||
"psubw %%mm"#R3", %%mm"#R1" \n\t" \
|
"psubw %%mm"#R3", %%mm"#R1" \n\t" \
|
||||||
"psraw %5, %%mm"#R1" \n\t" \
|
"psraw %4, %%mm"#R1" \n\t" \
|
||||||
"movq %%mm"#R1", "#OFF"(%2) \n\t" \
|
"movq %%mm"#R1", "#OFF"(%1) \n\t" \
|
||||||
"add %3, %1 \n\t"
|
"add %2, %0 \n\t"
|
||||||
|
|
||||||
DECLARE_ALIGNED_16(static const uint64_t, fact_9) = 0x0009000900090009ULL;
|
DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL;
|
||||||
|
|
||||||
/** Sacrifying mm6 allows to pipeline loads from src */
|
/** Sacrifying mm6 allows to pipeline loads from src */
|
||||||
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
|
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
|
||||||
const uint8_t *src, long int stride,
|
const uint8_t *src, long int stride,
|
||||||
int rnd, int64_t shift)
|
int rnd, int64_t shift)
|
||||||
{
|
{
|
||||||
int w = 3;
|
|
||||||
|
|
||||||
asm volatile(
|
asm volatile(
|
||||||
LOAD_ROUNDER_MMX("%6")
|
"mov $3, %%"REG_c" \n\t"
|
||||||
"movq %7, %%mm6 \n\t"
|
LOAD_ROUNDER_MMX("%5")
|
||||||
|
"movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
|
||||||
"1: \n\t"
|
"1: \n\t"
|
||||||
"movd (%1), %%mm2 \n\t"
|
"movd (%0), %%mm2 \n\t"
|
||||||
"add %3, %1 \n\t"
|
"add %2, %0 \n\t"
|
||||||
"movd (%1), %%mm3 \n\t"
|
"movd (%0), %%mm3 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm2 \n\t"
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm3 \n\t"
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
||||||
SHIFT2_LINE( 0, 1, 2, 3, 4)
|
SHIFT2_LINE( 0, 1, 2, 3, 4)
|
||||||
@ -93,14 +92,14 @@ static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
|
|||||||
SHIFT2_LINE(120, 2, 3, 4, 1)
|
SHIFT2_LINE(120, 2, 3, 4, 1)
|
||||||
SHIFT2_LINE(144, 3, 4, 1, 2)
|
SHIFT2_LINE(144, 3, 4, 1, 2)
|
||||||
SHIFT2_LINE(168, 4, 1, 2, 3)
|
SHIFT2_LINE(168, 4, 1, 2, 3)
|
||||||
"sub %8, %1 \n\t"
|
"sub %6, %0 \n\t"
|
||||||
"add $8, %2 \n\t"
|
"add $8, %1 \n\t"
|
||||||
"decl %0 \n\t"
|
"dec %%"REG_c" \n\t"
|
||||||
"jnz 1b \n\t"
|
"jnz 1b \n\t"
|
||||||
: "+g"(w), "+r"(src), "+r"(dst)
|
: "+r"(src), "+r"(dst)
|
||||||
: "r"(stride), "r"(-2*stride), "m"(shift),
|
: "r"(stride), "r"(-2*stride),
|
||||||
"m"(rnd), "m"(fact_9), "g"(9*stride-4)
|
"m"(shift), "m"(rnd), "r"(9*stride-4)
|
||||||
: "memory"
|
: "%"REG_c, "memory"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -117,8 +116,8 @@ static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, long int stride,
|
|||||||
rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */
|
rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */
|
||||||
asm volatile(
|
asm volatile(
|
||||||
LOAD_ROUNDER_MMX("%4")
|
LOAD_ROUNDER_MMX("%4")
|
||||||
"movq %6, %%mm6 \n\t"
|
"movq "MANGLE(ff_pw_128)", %%mm6\n\t"
|
||||||
"movq %5, %%mm5 \n\t"
|
"movq "MANGLE(ff_pw_9)", %%mm5 \n\t"
|
||||||
"1: \n\t"
|
"1: \n\t"
|
||||||
"movq 2*0+0(%1), %%mm1 \n\t"
|
"movq 2*0+0(%1), %%mm1 \n\t"
|
||||||
"movq 2*0+8(%1), %%mm2 \n\t"
|
"movq 2*0+8(%1), %%mm2 \n\t"
|
||||||
@ -141,8 +140,8 @@ static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, long int stride,
|
|||||||
"add %3, %2 \n\t"
|
"add %3, %2 \n\t"
|
||||||
"decl %0 \n\t"
|
"decl %0 \n\t"
|
||||||
"jnz 1b \n\t"
|
"jnz 1b \n\t"
|
||||||
: "+g"(h), "+r" (src), "+r" (dst)
|
: "+r"(h), "+r" (src), "+r" (dst)
|
||||||
: "g"(stride), "m"(rnd), "m"(fact_9), "m"(ff_pw_128)
|
: "r"(stride), "m"(rnd)
|
||||||
: "memory"
|
: "memory"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -155,48 +154,48 @@ static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, long int stride,
|
|||||||
static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
|
static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
|
||||||
long int stride, int rnd, long int offset)
|
long int stride, int rnd, long int offset)
|
||||||
{
|
{
|
||||||
int h = 8;
|
|
||||||
|
|
||||||
rnd = 8-rnd;
|
rnd = 8-rnd;
|
||||||
asm volatile(
|
asm volatile(
|
||||||
LOAD_ROUNDER_MMX("%6")
|
"mov $8, %%"REG_c" \n\t"
|
||||||
"movq %8, %%mm6 \n\t"
|
LOAD_ROUNDER_MMX("%5")
|
||||||
|
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"
|
||||||
"1: \n\t"
|
"1: \n\t"
|
||||||
"movd 0(%1 ), %%mm3 \n\t"
|
"movd 0(%0 ), %%mm3 \n\t"
|
||||||
"movd 4(%1 ), %%mm4 \n\t"
|
"movd 4(%0 ), %%mm4 \n\t"
|
||||||
"movd 0(%1,%3), %%mm1 \n\t"
|
"movd 0(%0,%2), %%mm1 \n\t"
|
||||||
"movd 4(%1,%3), %%mm2 \n\t"
|
"movd 4(%0,%2), %%mm2 \n\t"
|
||||||
"add %3, %1 \n\t"
|
"add %2, %0 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm3 \n\t"
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm4 \n\t"
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm1 \n\t"
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm2 \n\t"
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
||||||
"paddw %%mm1, %%mm3 \n\t"
|
"paddw %%mm1, %%mm3 \n\t"
|
||||||
"paddw %%mm2, %%mm4 \n\t"
|
"paddw %%mm2, %%mm4 \n\t"
|
||||||
"movd 0(%1,%4), %%mm1 \n\t"
|
"movd 0(%0,%3), %%mm1 \n\t"
|
||||||
"movd 4(%1,%4), %%mm2 \n\t"
|
"movd 4(%0,%3), %%mm2 \n\t"
|
||||||
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/
|
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/
|
||||||
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/
|
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/
|
||||||
"punpcklbw %%mm0, %%mm1 \n\t"
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm2 \n\t"
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
||||||
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/
|
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/
|
||||||
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/
|
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/
|
||||||
"movd 0(%1,%3), %%mm1 \n\t"
|
"movd 0(%0,%2), %%mm1 \n\t"
|
||||||
"movd 4(%1,%3), %%mm2 \n\t"
|
"movd 4(%0,%2), %%mm2 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm1 \n\t"
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
||||||
"punpcklbw %%mm0, %%mm2 \n\t"
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
||||||
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/
|
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/
|
||||||
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/
|
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/
|
||||||
NORMALIZE_MMX("$4")
|
NORMALIZE_MMX("$4")
|
||||||
TRANSFER_DO_PACK
|
"packuswb %%mm4, %%mm3 \n\t"
|
||||||
"add %7, %1 \n\t"
|
"movq %%mm3, (%1) \n\t"
|
||||||
"add %5, %2 \n\t"
|
"add %6, %0 \n\t"
|
||||||
"decl %0 \n\t"
|
"add %4, %1 \n\t"
|
||||||
|
"dec %%"REG_c" \n\t"
|
||||||
"jnz 1b \n\t"
|
"jnz 1b \n\t"
|
||||||
: "+g"(h), "+r"(src), "+r"(dst)
|
: "+r"(src), "+r"(dst)
|
||||||
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),
|
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),
|
||||||
"g"(stride-offset), "m"(fact_9)
|
"g"(stride-offset)
|
||||||
: "memory"
|
: "%"REG_c, "memory"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -204,8 +203,8 @@ static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
|
|||||||
* Filter coefficients made global to allow access by all 1 or 3 quarter shift
|
* Filter coefficients made global to allow access by all 1 or 3 quarter shift
|
||||||
* interpolation functions.
|
* interpolation functions.
|
||||||
*/
|
*/
|
||||||
DECLARE_ALIGNED_16(static const uint64_t, fact_53) = 0x0035003500350035ULL;
|
DECLARE_ALIGNED_16(const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
|
||||||
DECLARE_ALIGNED_16(static const uint64_t, fact_18) = 0x0012001200120012ULL;
|
DECLARE_ALIGNED_16(const uint64_t, ff_pw_18) = 0x0012001200120012ULL;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Core of the 1/4 and 3/4 shift bicubic interpolation.
|
* Core of the 1/4 and 3/4 shift bicubic interpolation.
|
||||||
@ -217,13 +216,13 @@ DECLARE_ALIGNED_16(static const uint64_t, fact_18) = 0x0012001200120012ULL;
|
|||||||
* @param A3 Address of 3rd tap
|
* @param A3 Address of 3rd tap
|
||||||
* @param A4 Address of 4th tap
|
* @param A4 Address of 4th tap
|
||||||
*/
|
*/
|
||||||
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4, POS) \
|
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
|
||||||
MOVQ "*0+"A1", %%mm1 \n\t" \
|
MOVQ "*0+"A1", %%mm1 \n\t" \
|
||||||
MOVQ "*4+"A1", %%mm2 \n\t" \
|
MOVQ "*4+"A1", %%mm2 \n\t" \
|
||||||
UNPACK("%%mm1") \
|
UNPACK("%%mm1") \
|
||||||
UNPACK("%%mm2") \
|
UNPACK("%%mm2") \
|
||||||
"pmullw "POS", %%mm1 \n\t" \
|
"pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
|
||||||
"pmullw "POS", %%mm2 \n\t" \
|
"pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
|
||||||
MOVQ "*0+"A2", %%mm3 \n\t" \
|
MOVQ "*0+"A2", %%mm3 \n\t" \
|
||||||
MOVQ "*4+"A2", %%mm4 \n\t" \
|
MOVQ "*4+"A2", %%mm4 \n\t" \
|
||||||
UNPACK("%%mm3") \
|
UNPACK("%%mm3") \
|
||||||
@ -267,11 +266,11 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
|
|||||||
src -= src_stride; \
|
src -= src_stride; \
|
||||||
asm volatile( \
|
asm volatile( \
|
||||||
LOAD_ROUNDER_MMX("%5") \
|
LOAD_ROUNDER_MMX("%5") \
|
||||||
"movq %7, %%mm5 \n\t" \
|
"movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
|
||||||
"movq %8, %%mm6 \n\t" \
|
"movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
|
||||||
ASMALIGN(3) \
|
ASMALIGN(3) \
|
||||||
"1: \n\t" \
|
"1: \n\t" \
|
||||||
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4, "%9") \
|
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
|
||||||
NORMALIZE_MMX("%6") \
|
NORMALIZE_MMX("%6") \
|
||||||
TRANSFER_DONT_PACK \
|
TRANSFER_DONT_PACK \
|
||||||
/* Last 3 (in fact 4) bytes on the line */ \
|
/* Last 3 (in fact 4) bytes on the line */ \
|
||||||
@ -299,10 +298,9 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
|
|||||||
"add $24, %2 \n\t" \
|
"add $24, %2 \n\t" \
|
||||||
"decl %0 \n\t" \
|
"decl %0 \n\t" \
|
||||||
"jnz 1b \n\t" \
|
"jnz 1b \n\t" \
|
||||||
: "+g"(h), "+r" (src), "+r" (dst) \
|
: "+r"(h), "+r" (src), "+r" (dst) \
|
||||||
: "r"(src_stride), "r"(3*src_stride), \
|
: "r"(src_stride), "r"(3*src_stride), \
|
||||||
"m"(rnd), "m"(shift), \
|
"m"(rnd), "m"(shift) \
|
||||||
"m"(fact_53), "m"(fact_18), "m"(ff_pw_3) \
|
|
||||||
: "memory" \
|
: "memory" \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
@ -324,23 +322,22 @@ vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, long int stride, \
|
|||||||
rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
|
rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
|
||||||
asm volatile( \
|
asm volatile( \
|
||||||
LOAD_ROUNDER_MMX("%4") \
|
LOAD_ROUNDER_MMX("%4") \
|
||||||
"movq %6, %%mm6 \n\t" \
|
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
|
||||||
"movq %5, %%mm5 \n\t" \
|
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
|
||||||
ASMALIGN(3) \
|
ASMALIGN(3) \
|
||||||
"1: \n\t" \
|
"1: \n\t" \
|
||||||
MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4, "%8")\
|
MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
|
||||||
NORMALIZE_MMX("$7") \
|
NORMALIZE_MMX("$7") \
|
||||||
/* Remove bias */ \
|
/* Remove bias */ \
|
||||||
"paddw %7, %%mm3 \n\t" \
|
"paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
|
||||||
"paddw %7, %%mm4 \n\t" \
|
"paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
|
||||||
TRANSFER_DO_PACK \
|
TRANSFER_DO_PACK \
|
||||||
"add $24, %1 \n\t" \
|
"add $24, %1 \n\t" \
|
||||||
"add %3, %2 \n\t" \
|
"add %3, %2 \n\t" \
|
||||||
"decl %0 \n\t" \
|
"decl %0 \n\t" \
|
||||||
"jnz 1b \n\t" \
|
"jnz 1b \n\t" \
|
||||||
: "+g"(h), "+r" (src), "+r" (dst) \
|
: "+r"(h), "+r" (src), "+r" (dst) \
|
||||||
: "g"(stride), "m"(rnd), "m"(fact_53), "m"(fact_18), \
|
: "r"(stride), "m"(rnd) \
|
||||||
"m"(ff_pw_128), "m"(ff_pw_3) \
|
|
||||||
: "memory" \
|
: "memory" \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
@ -363,20 +360,19 @@ vc1_put_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
|
|||||||
rnd = 32-rnd; \
|
rnd = 32-rnd; \
|
||||||
asm volatile ( \
|
asm volatile ( \
|
||||||
LOAD_ROUNDER_MMX("%6") \
|
LOAD_ROUNDER_MMX("%6") \
|
||||||
"movq %7, %%mm5 \n\t" \
|
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
|
||||||
"movq %8, %%mm6 \n\t" \
|
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
|
||||||
ASMALIGN(3) \
|
ASMALIGN(3) \
|
||||||
"1: \n\t" \
|
"1: \n\t" \
|
||||||
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4, "%9")\
|
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
|
||||||
NORMALIZE_MMX("$6") \
|
NORMALIZE_MMX("$6") \
|
||||||
TRANSFER_DO_PACK \
|
TRANSFER_DO_PACK \
|
||||||
"add %5, %1 \n\t" \
|
"add %5, %1 \n\t" \
|
||||||
"add %5, %2 \n\t" \
|
"add %5, %2 \n\t" \
|
||||||
"decl %0 \n\t" \
|
"decl %0 \n\t" \
|
||||||
"jnz 1b \n\t" \
|
"jnz 1b \n\t" \
|
||||||
: "+g"(h), "+r" (src), "+r" (dst) \
|
: "+r"(h), "+r" (src), "+r" (dst) \
|
||||||
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd), \
|
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
|
||||||
"m"(fact_53), "m"(fact_18), "m"(ff_pw_3) \
|
|
||||||
: "memory" \
|
: "memory" \
|
||||||
); \
|
); \
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user