summary of changes:
- Use MANGLE when loading some constants into MMX registers.
- Convert those constants to non-static and thus add ff_ prefix.
- Remove last parameter of MSPEL_FILTER13_CORE (was constant).
- Use the register-only "+r" constraint instead of the more permissive, and here unnecessary, "+g".
- Use REG_c as the loop counter in some functions, loading it directly with an immediate instead of passing a C variable as an asm operand.
patch by Christophe GISQUET, christophe.gisquet free fr
Subject: [FFmpeg-devel] [PATCH] Roundup issue #301
Date: Fri, 28 Dec 2007 19:22:18 +0100

Originally committed as revision 11376 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
Christophe Gisquet 2008-01-02 19:24:42 +00:00 committed by Diego Biurrun
parent 8f8fae80b2
commit ae904fd028

View File

@ -55,34 +55,33 @@
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \ #define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
"paddw %%mm"#R2", %%mm"#R1" \n\t" \ "paddw %%mm"#R2", %%mm"#R1" \n\t" \
"movd (%1,%4), %%mm"#R0" \n\t" \ "movd (%0,%3), %%mm"#R0" \n\t" \
"pmullw %%mm6, %%mm"#R1" \n\t" \ "pmullw %%mm6, %%mm"#R1" \n\t" \
"punpcklbw %%mm0, %%mm"#R0" \n\t" \ "punpcklbw %%mm0, %%mm"#R0" \n\t" \
"movd (%1,%3), %%mm"#R3" \n\t" \ "movd (%0,%2), %%mm"#R3" \n\t" \
"psubw %%mm"#R0", %%mm"#R1" \n\t" \ "psubw %%mm"#R0", %%mm"#R1" \n\t" \
"punpcklbw %%mm0, %%mm"#R3" \n\t" \ "punpcklbw %%mm0, %%mm"#R3" \n\t" \
"paddw %%mm7, %%mm"#R1" \n\t" \ "paddw %%mm7, %%mm"#R1" \n\t" \
"psubw %%mm"#R3", %%mm"#R1" \n\t" \ "psubw %%mm"#R3", %%mm"#R1" \n\t" \
"psraw %5, %%mm"#R1" \n\t" \ "psraw %4, %%mm"#R1" \n\t" \
"movq %%mm"#R1", "#OFF"(%2) \n\t" \ "movq %%mm"#R1", "#OFF"(%1) \n\t" \
"add %3, %1 \n\t" "add %2, %0 \n\t"
DECLARE_ALIGNED_16(static const uint64_t, fact_9) = 0x0009000900090009ULL; DECLARE_ALIGNED_16(const uint64_t, ff_pw_9) = 0x0009000900090009ULL;
/** Sacrificing mm6 allows pipelining of the loads from src */ /** Sacrificing mm6 allows pipelining of the loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, long int stride, const uint8_t *src, long int stride,
int rnd, int64_t shift) int rnd, int64_t shift)
{ {
int w = 3;
asm volatile( asm volatile(
LOAD_ROUNDER_MMX("%6") "mov $3, %%"REG_c" \n\t"
"movq %7, %%mm6 \n\t" LOAD_ROUNDER_MMX("%5")
"movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
"1: \n\t" "1: \n\t"
"movd (%1), %%mm2 \n\t" "movd (%0), %%mm2 \n\t"
"add %3, %1 \n\t" "add %2, %0 \n\t"
"movd (%1), %%mm3 \n\t" "movd (%0), %%mm3 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t" "punpcklbw %%mm0, %%mm2 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t" "punpcklbw %%mm0, %%mm3 \n\t"
SHIFT2_LINE( 0, 1, 2, 3, 4) SHIFT2_LINE( 0, 1, 2, 3, 4)
@ -93,14 +92,14 @@ static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
SHIFT2_LINE(120, 2, 3, 4, 1) SHIFT2_LINE(120, 2, 3, 4, 1)
SHIFT2_LINE(144, 3, 4, 1, 2) SHIFT2_LINE(144, 3, 4, 1, 2)
SHIFT2_LINE(168, 4, 1, 2, 3) SHIFT2_LINE(168, 4, 1, 2, 3)
"sub %8, %1 \n\t" "sub %6, %0 \n\t"
"add $8, %2 \n\t" "add $8, %1 \n\t"
"decl %0 \n\t" "dec %%"REG_c" \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
: "+g"(w), "+r"(src), "+r"(dst) : "+r"(src), "+r"(dst)
: "r"(stride), "r"(-2*stride), "m"(shift), : "r"(stride), "r"(-2*stride),
"m"(rnd), "m"(fact_9), "g"(9*stride-4) "m"(shift), "m"(rnd), "r"(9*stride-4)
: "memory" : "%"REG_c, "memory"
); );
} }
@ -117,8 +116,8 @@ static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, long int stride,
rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */ rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */
asm volatile( asm volatile(
LOAD_ROUNDER_MMX("%4") LOAD_ROUNDER_MMX("%4")
"movq %6, %%mm6 \n\t" "movq "MANGLE(ff_pw_128)", %%mm6\n\t"
"movq %5, %%mm5 \n\t" "movq "MANGLE(ff_pw_9)", %%mm5 \n\t"
"1: \n\t" "1: \n\t"
"movq 2*0+0(%1), %%mm1 \n\t" "movq 2*0+0(%1), %%mm1 \n\t"
"movq 2*0+8(%1), %%mm2 \n\t" "movq 2*0+8(%1), %%mm2 \n\t"
@ -141,8 +140,8 @@ static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, long int stride,
"add %3, %2 \n\t" "add %3, %2 \n\t"
"decl %0 \n\t" "decl %0 \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
: "+g"(h), "+r" (src), "+r" (dst) : "+r"(h), "+r" (src), "+r" (dst)
: "g"(stride), "m"(rnd), "m"(fact_9), "m"(ff_pw_128) : "r"(stride), "m"(rnd)
: "memory" : "memory"
); );
} }
@ -155,48 +154,48 @@ static void vc1_put_hor_16b_shift2_mmx(uint8_t *dst, long int stride,
static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src, static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
long int stride, int rnd, long int offset) long int stride, int rnd, long int offset)
{ {
int h = 8;
rnd = 8-rnd; rnd = 8-rnd;
asm volatile( asm volatile(
LOAD_ROUNDER_MMX("%6") "mov $8, %%"REG_c" \n\t"
"movq %8, %%mm6 \n\t" LOAD_ROUNDER_MMX("%5")
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"
"1: \n\t" "1: \n\t"
"movd 0(%1 ), %%mm3 \n\t" "movd 0(%0 ), %%mm3 \n\t"
"movd 4(%1 ), %%mm4 \n\t" "movd 4(%0 ), %%mm4 \n\t"
"movd 0(%1,%3), %%mm1 \n\t" "movd 0(%0,%2), %%mm1 \n\t"
"movd 4(%1,%3), %%mm2 \n\t" "movd 4(%0,%2), %%mm2 \n\t"
"add %3, %1 \n\t" "add %2, %0 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t" "punpcklbw %%mm0, %%mm3 \n\t"
"punpcklbw %%mm0, %%mm4 \n\t" "punpcklbw %%mm0, %%mm4 \n\t"
"punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm1 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t" "punpcklbw %%mm0, %%mm2 \n\t"
"paddw %%mm1, %%mm3 \n\t" "paddw %%mm1, %%mm3 \n\t"
"paddw %%mm2, %%mm4 \n\t" "paddw %%mm2, %%mm4 \n\t"
"movd 0(%1,%4), %%mm1 \n\t" "movd 0(%0,%3), %%mm1 \n\t"
"movd 4(%1,%4), %%mm2 \n\t" "movd 4(%0,%3), %%mm2 \n\t"
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/ "pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/ "pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/
"punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm1 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t" "punpcklbw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/ "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/ "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/
"movd 0(%1,%3), %%mm1 \n\t" "movd 0(%0,%2), %%mm1 \n\t"
"movd 4(%1,%3), %%mm2 \n\t" "movd 4(%0,%2), %%mm2 \n\t"
"punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm1 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t" "punpcklbw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/ "psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/ "psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/
NORMALIZE_MMX("$4") NORMALIZE_MMX("$4")
TRANSFER_DO_PACK "packuswb %%mm4, %%mm3 \n\t"
"add %7, %1 \n\t" "movq %%mm3, (%1) \n\t"
"add %5, %2 \n\t" "add %6, %0 \n\t"
"decl %0 \n\t" "add %4, %1 \n\t"
"dec %%"REG_c" \n\t"
"jnz 1b \n\t" "jnz 1b \n\t"
: "+g"(h), "+r"(src), "+r"(dst) : "+r"(src), "+r"(dst)
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd), : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),
"g"(stride-offset), "m"(fact_9) "g"(stride-offset)
: "memory" : "%"REG_c, "memory"
); );
} }
@ -204,8 +203,8 @@ static void vc1_put_shift2_mmx(uint8_t *dst, const uint8_t *src,
* Filter coefficients made global to allow access by all 1 or 3 quarter shift * Filter coefficients made global to allow access by all 1 or 3 quarter shift
* interpolation functions. * interpolation functions.
*/ */
DECLARE_ALIGNED_16(static const uint64_t, fact_53) = 0x0035003500350035ULL; DECLARE_ALIGNED_16(const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED_16(static const uint64_t, fact_18) = 0x0012001200120012ULL; DECLARE_ALIGNED_16(const uint64_t, ff_pw_18) = 0x0012001200120012ULL;
/** /**
* Core of the 1/4 and 3/4 shift bicubic interpolation. * Core of the 1/4 and 3/4 shift bicubic interpolation.
@ -217,13 +216,13 @@ DECLARE_ALIGNED_16(static const uint64_t, fact_18) = 0x0012001200120012ULL;
* @param A3 Address of 3rd tap * @param A3 Address of 3rd tap
* @param A4 Address of 4th tap * @param A4 Address of 4th tap
*/ */
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4, POS) \ #define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
MOVQ "*0+"A1", %%mm1 \n\t" \ MOVQ "*0+"A1", %%mm1 \n\t" \
MOVQ "*4+"A1", %%mm2 \n\t" \ MOVQ "*4+"A1", %%mm2 \n\t" \
UNPACK("%%mm1") \ UNPACK("%%mm1") \
UNPACK("%%mm2") \ UNPACK("%%mm2") \
"pmullw "POS", %%mm1 \n\t" \ "pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
"pmullw "POS", %%mm2 \n\t" \ "pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
MOVQ "*0+"A2", %%mm3 \n\t" \ MOVQ "*0+"A2", %%mm3 \n\t" \
MOVQ "*4+"A2", %%mm4 \n\t" \ MOVQ "*4+"A2", %%mm4 \n\t" \
UNPACK("%%mm3") \ UNPACK("%%mm3") \
@ -267,11 +266,11 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
src -= src_stride; \ src -= src_stride; \
asm volatile( \ asm volatile( \
LOAD_ROUNDER_MMX("%5") \ LOAD_ROUNDER_MMX("%5") \
"movq %7, %%mm5 \n\t" \ "movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
"movq %8, %%mm6 \n\t" \ "movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
ASMALIGN(3) \ ASMALIGN(3) \
"1: \n\t" \ "1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4, "%9") \ MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("%6") \ NORMALIZE_MMX("%6") \
TRANSFER_DONT_PACK \ TRANSFER_DONT_PACK \
/* Last 3 (in fact 4) bytes on the line */ \ /* Last 3 (in fact 4) bytes on the line */ \
@ -299,10 +298,9 @@ vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
"add $24, %2 \n\t" \ "add $24, %2 \n\t" \
"decl %0 \n\t" \ "decl %0 \n\t" \
"jnz 1b \n\t" \ "jnz 1b \n\t" \
: "+g"(h), "+r" (src), "+r" (dst) \ : "+r"(h), "+r" (src), "+r" (dst) \
: "r"(src_stride), "r"(3*src_stride), \ : "r"(src_stride), "r"(3*src_stride), \
"m"(rnd), "m"(shift), \ "m"(rnd), "m"(shift) \
"m"(fact_53), "m"(fact_18), "m"(ff_pw_3) \
: "memory" \ : "memory" \
); \ ); \
} }
@ -324,23 +322,22 @@ vc1_put_hor_16b_ ## NAME ## _mmx(uint8_t *dst, long int stride, \
rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
asm volatile( \ asm volatile( \
LOAD_ROUNDER_MMX("%4") \ LOAD_ROUNDER_MMX("%4") \
"movq %6, %%mm6 \n\t" \ "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
"movq %5, %%mm5 \n\t" \ "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
ASMALIGN(3) \ ASMALIGN(3) \
"1: \n\t" \ "1: \n\t" \
MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4, "%8")\ MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
NORMALIZE_MMX("$7") \ NORMALIZE_MMX("$7") \
/* Remove bias */ \ /* Remove bias */ \
"paddw %7, %%mm3 \n\t" \ "paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
"paddw %7, %%mm4 \n\t" \ "paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
TRANSFER_DO_PACK \ TRANSFER_DO_PACK \
"add $24, %1 \n\t" \ "add $24, %1 \n\t" \
"add %3, %2 \n\t" \ "add %3, %2 \n\t" \
"decl %0 \n\t" \ "decl %0 \n\t" \
"jnz 1b \n\t" \ "jnz 1b \n\t" \
: "+g"(h), "+r" (src), "+r" (dst) \ : "+r"(h), "+r" (src), "+r" (dst) \
: "g"(stride), "m"(rnd), "m"(fact_53), "m"(fact_18), \ : "r"(stride), "m"(rnd) \
"m"(ff_pw_128), "m"(ff_pw_3) \
: "memory" \ : "memory" \
); \ ); \
} }
@ -363,20 +360,19 @@ vc1_put_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
rnd = 32-rnd; \ rnd = 32-rnd; \
asm volatile ( \ asm volatile ( \
LOAD_ROUNDER_MMX("%6") \ LOAD_ROUNDER_MMX("%6") \
"movq %7, %%mm5 \n\t" \ "movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
"movq %8, %%mm6 \n\t" \ "movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
ASMALIGN(3) \ ASMALIGN(3) \
"1: \n\t" \ "1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4, "%9")\ MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("$6") \ NORMALIZE_MMX("$6") \
TRANSFER_DO_PACK \ TRANSFER_DO_PACK \
"add %5, %1 \n\t" \ "add %5, %1 \n\t" \
"add %5, %2 \n\t" \ "add %5, %2 \n\t" \
"decl %0 \n\t" \ "decl %0 \n\t" \
"jnz 1b \n\t" \ "jnz 1b \n\t" \
: "+g"(h), "+r" (src), "+r" (dst) \ : "+r"(h), "+r" (src), "+r" (dst) \
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd), \ : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
"m"(fact_53), "m"(fact_18), "m"(ff_pw_3) \
: "memory" \ : "memory" \
); \ ); \
} }