Cleanup _t types in libavcodec/ppc

Originally committed as revision 16357 to svn://svn.ffmpeg.org/ffmpeg/trunk
Luca Barbato 2008-12-27 11:21:28 +00:00
parent 7f9b3266c9
commit a6b4448cdf
5 changed files with 326 additions and 334 deletions

libavcodec/ppc/h264_altivec.c

@ -189,32 +189,32 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
((8 - x) * (y)), ((8 - x) * (y)),
((x) * (y))}; ((x) * (y))};
register int i; register int i;
vec_u8_t fperm; vec_u8 fperm;
const vec_s32_t vABCD = vec_ld(0, ABCD); const vec_s32 vABCD = vec_ld(0, ABCD);
const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
LOAD_ZERO; LOAD_ZERO;
const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4)); const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
const vec_u16_t v6us = vec_splat_u16(6); const vec_u16 v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vec_u8_t vsrc0uc, vsrc1uc; vec_u8 vsrc0uc, vsrc1uc;
vec_s16_t vsrc0ssH, vsrc1ssH; vec_s16 vsrc0ssH, vsrc1ssH;
vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
vec_s16_t vsrc2ssH, vsrc3ssH, psum; vec_s16 vsrc2ssH, vsrc3ssH, psum;
vec_u8_t vdst, ppsum, fsum; vec_u8 vdst, ppsum, fsum;
if (((unsigned long)dst) % 16 == 0) { if (((unsigned long)dst) % 16 == 0) {
fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13, fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17, 0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F}; 0x0C, 0x0D, 0x0E, 0x0F};
} else { } else {
fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03, fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B, 0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F}; 0x1C, 0x1D, 0x1E, 0x1F};
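
The ABCD weights and the v28ss constant above drive a bilinear chroma interpolation; as a point of reference, here is a scalar sketch of what the vector loop computes on the general x/y path (the rounding term 28 = (1<<5)-4 is read off the v28ss initializer — this is an interpretation of the code, not part of the patch):

#include <stdint.h>

/* Scalar sketch of the no-rounding 8-wide chroma MC driven by ABCD above:
 * A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, rounding term 28. */
static void chroma_mc8_no_rnd_scalar(uint8_t *dst, const uint8_t *src,
                                     int stride, int h,
                                     int A, int B, int C, int D)
{
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < 8; i++)
            dst[i] = (A * src[i]          + B * src[i + 1] +
                      C * src[i + stride] + D * src[i + stride + 1] + 28) >> 6;
        dst += stride;
        src += stride;
    }
}
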
@ -233,8 +233,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
else else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc); vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc); vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
if (!loadSecond) {// -> !reallyBadAlign if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
@ -245,8 +245,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
@ -256,7 +256,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
psum = vec_sra(psum, v6us); psum = vec_sra(psum, v6us);
vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vec_u8_t)vec_packsu(psum, psum); ppsum = (vec_u8)vec_packsu(psum, psum);
fsum = vec_perm(vdst, ppsum, fperm); fsum = vec_perm(vdst, ppsum, fperm);
vec_st(fsum, 0, dst); vec_st(fsum, 0, dst);
@ -268,7 +268,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
src += stride; src += stride;
} }
} else { } else {
vec_u8_t vsrcDuc; vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src); vsrcDuc = vec_ld(stride + 16, src);
@ -279,8 +279,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
else else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc); vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc2uc);
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc); vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc3uc);
psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
@ -290,7 +290,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
psum = vec_sr(psum, v6us); psum = vec_sr(psum, v6us);
vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vec_u8_t)vec_pack(psum, psum); ppsum = (vec_u8)vec_pack(psum, psum);
fsum = vec_perm(vdst, ppsum, fperm); fsum = vec_perm(vdst, ppsum, fperm);
vec_st(fsum, 0, dst); vec_st(fsum, 0, dst);
@ -309,7 +309,7 @@ static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h) int src_stride1, int h)
{ {
int i; int i;
vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
mask_ = vec_lvsl(0, src2); mask_ = vec_lvsl(0, src2);
@ -351,7 +351,7 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h) int src_stride1, int h)
{ {
int i; int i;
vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align; vec_u8 a, b, d, tmp1, tmp2, mask, mask_, edges, align;
mask_ = vec_lvsl(0, src2); mask_ = vec_lvsl(0, src2);
@ -432,23 +432,23 @@ H264_MC(avg_, 16, altivec)
#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \ #define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
vdst_orig = vec_ld(0, dst); \ vdst_orig = vec_ld(0, dst); \
vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \ vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \ vdst_ss = (vec_s16) vec_mergeh(zero_u8v, vdst); \
va = vec_add(va, vdst_ss); \ va = vec_add(va, vdst_ss); \
va_u8 = vec_packsu(va, zero_s16v); \ va_u8 = vec_packsu(va, zero_s16v); \
va_u32 = vec_splat((vec_u32_t)va_u8, 0); \ va_u32 = vec_splat((vec_u32)va_u8, 0); \
vec_ste(va_u32, element, (uint32_t*)dst); vec_ste(va_u32, element, (uint32_t*)dst);
static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride) static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{ {
vec_s16_t va0, va1, va2, va3; vec_s16 va0, va1, va2, va3;
vec_s16_t vz0, vz1, vz2, vz3; vec_s16 vz0, vz1, vz2, vz3;
vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3; vec_s16 vtmp0, vtmp1, vtmp2, vtmp3;
vec_u8_t va_u8; vec_u8 va_u8;
vec_u32_t va_u32; vec_u32 va_u32;
vec_s16_t vdst_ss; vec_s16 vdst_ss;
const vec_u16_t v6us = vec_splat_u16(6); const vec_u16 v6us = vec_splat_u16(6);
vec_u8_t vdst, vdst_orig; vec_u8 vdst, vdst_orig;
vec_u8_t vdst_mask = vec_lvsl(0, dst); vec_u8 vdst_mask = vec_lvsl(0, dst);
int element = ((unsigned long)dst & 0xf) >> 2; int element = ((unsigned long)dst & 0xf) >> 2;
LOAD_ZERO; LOAD_ZERO;
@ -479,40 +479,40 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\ #define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
/* a0 = SRC(0) + SRC(4); */ \ /* a0 = SRC(0) + SRC(4); */ \
vec_s16_t a0v = vec_add(s0, s4); \ vec_s16 a0v = vec_add(s0, s4); \
/* a2 = SRC(0) - SRC(4); */ \ /* a2 = SRC(0) - SRC(4); */ \
vec_s16_t a2v = vec_sub(s0, s4); \ vec_s16 a2v = vec_sub(s0, s4); \
/* a4 = (SRC(2)>>1) - SRC(6); */ \ /* a4 = (SRC(2)>>1) - SRC(6); */ \
vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \ vec_s16 a4v = vec_sub(vec_sra(s2, onev), s6); \
/* a6 = (SRC(6)>>1) + SRC(2); */ \ /* a6 = (SRC(6)>>1) + SRC(2); */ \
vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \ vec_s16 a6v = vec_add(vec_sra(s6, onev), s2); \
/* b0 = a0 + a6; */ \ /* b0 = a0 + a6; */ \
vec_s16_t b0v = vec_add(a0v, a6v); \ vec_s16 b0v = vec_add(a0v, a6v); \
/* b2 = a2 + a4; */ \ /* b2 = a2 + a4; */ \
vec_s16_t b2v = vec_add(a2v, a4v); \ vec_s16 b2v = vec_add(a2v, a4v); \
/* b4 = a2 - a4; */ \ /* b4 = a2 - a4; */ \
vec_s16_t b4v = vec_sub(a2v, a4v); \ vec_s16 b4v = vec_sub(a2v, a4v); \
/* b6 = a0 - a6; */ \ /* b6 = a0 - a6; */ \
vec_s16_t b6v = vec_sub(a0v, a6v); \ vec_s16 b6v = vec_sub(a0v, a6v); \
/* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \ /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
/* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \ /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \ vec_s16 a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
/* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \ /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
/* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \ /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\ vec_s16 a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
/* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \ /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
/* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \ /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\ vec_s16 a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
/* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \ /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\ vec_s16 a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
/* b1 = (a7>>2) + a1; */ \ /* b1 = (a7>>2) + a1; */ \
vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \ vec_s16 b1v = vec_add( vec_sra(a7v, twov), a1v); \
/* b3 = a3 + (a5>>2); */ \ /* b3 = a3 + (a5>>2); */ \
vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \ vec_s16 b3v = vec_add(a3v, vec_sra(a5v, twov)); \
/* b5 = (a3>>2) - a5; */ \ /* b5 = (a3>>2) - a5; */ \
vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \ vec_s16 b5v = vec_sub( vec_sra(a3v, twov), a5v); \
/* b7 = a7 - (a1>>2); */ \ /* b7 = a7 - (a1>>2); */ \
vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \ vec_s16 b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
/* DST(0, b0 + b7); */ \ /* DST(0, b0 + b7); */ \
d0 = vec_add(b0v, b7v); \ d0 = vec_add(b0v, b7v); \
/* DST(1, b2 + b5); */ \ /* DST(1, b2 + b5); */ \
@ -533,17 +533,17 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \ #define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
/* unaligned load */ \ /* unaligned load */ \
vec_u8_t hv = vec_ld( 0, dest ); \ vec_u8 hv = vec_ld( 0, dest ); \
vec_u8_t lv = vec_ld( 7, dest ); \ vec_u8 lv = vec_ld( 7, dest ); \
vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \ vec_u8 dstv = vec_perm( hv, lv, (vec_u8)perm_ldv ); \
vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \ vec_s16 idct_sh6 = vec_sra(idctv, sixv); \
vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \ vec_u16 dst16 = (vec_u16)vec_mergeh(zero_u8v, dstv); \
vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \ vec_s16 idstsum = vec_adds(idct_sh6, (vec_s16)dst16); \
vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \ vec_u8 idstsum8 = vec_packsu(zero_s16v, idstsum); \
vec_u8_t edgehv; \ vec_u8 edgehv; \
/* unaligned store */ \ /* unaligned store */ \
vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\ vec_u8 bodyv = vec_perm( idstsum8, idstsum8, perm_stv );\
vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \ vec_u8 edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
lv = vec_sel( lv, bodyv, edgelv ); \ lv = vec_sel( lv, bodyv, edgelv ); \
vec_st( lv, 7, dest ); \ vec_st( lv, 7, dest ); \
hv = vec_ld( 0, dest ); \ hv = vec_ld( 0, dest ); \
@ -553,18 +553,18 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
} }
void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) { void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7; vec_s16 s0, s1, s2, s3, s4, s5, s6, s7;
vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7; vec_s16 d0, d1, d2, d3, d4, d5, d6, d7;
vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7; vec_s16 idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;
vec_u8_t perm_ldv = vec_lvsl(0, dst); vec_u8 perm_ldv = vec_lvsl(0, dst);
vec_u8_t perm_stv = vec_lvsr(8, dst); vec_u8 perm_stv = vec_lvsr(8, dst);
const vec_u16_t onev = vec_splat_u16(1); const vec_u16 onev = vec_splat_u16(1);
const vec_u16_t twov = vec_splat_u16(2); const vec_u16 twov = vec_splat_u16(2);
const vec_u16_t sixv = vec_splat_u16(6); const vec_u16 sixv = vec_splat_u16(6);
const vec_u8_t sel = (vec_u8_t) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1}; const vec_u8 sel = (vec_u8) {0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1};
LOAD_ZERO; LOAD_ZERO;
dct[0] += 32; // rounding for the >>6 at the end dct[0] += 32; // rounding for the >>6 at the end
@ -621,10 +621,10 @@ static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DC
} }
#define transpose4x16(r0, r1, r2, r3) { \ #define transpose4x16(r0, r1, r2, r3) { \
register vec_u8_t r4; \ register vec_u8 r4; \
register vec_u8_t r5; \ register vec_u8 r5; \
register vec_u8_t r6; \ register vec_u8 r6; \
register vec_u8_t r7; \ register vec_u8 r7; \
\ \
r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
@ -638,8 +638,8 @@ static void ff_h264_idct8_add4_altivec(uint8_t *dst, const int *block_offset, DC
} }
static inline void write16x4(uint8_t *dst, int dst_stride, static inline void write16x4(uint8_t *dst, int dst_stride,
register vec_u8_t r0, register vec_u8_t r1, register vec_u8 r0, register vec_u8 r1,
register vec_u8_t r2, register vec_u8_t r3) { register vec_u8 r2, register vec_u8 r3) {
DECLARE_ALIGNED_16(unsigned char, result[64]); DECLARE_ALIGNED_16(unsigned char, result[64]);
uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
int int_dst_stride = dst_stride/4; int int_dst_stride = dst_stride/4;
@ -671,16 +671,16 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
\todo FIXME: see if we can't spare some vec_lvsl() by them factorizing \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
out of unaligned_load() */ out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
register vec_u8_t r0 = unaligned_load(0, src); \ register vec_u8 r0 = unaligned_load(0, src); \
register vec_u8_t r1 = unaligned_load( src_stride, src); \ register vec_u8 r1 = unaligned_load( src_stride, src); \
register vec_u8_t r2 = unaligned_load(2* src_stride, src); \ register vec_u8 r2 = unaligned_load(2* src_stride, src); \
register vec_u8_t r3 = unaligned_load(3* src_stride, src); \ register vec_u8 r3 = unaligned_load(3* src_stride, src); \
register vec_u8_t r4 = unaligned_load(4* src_stride, src); \ register vec_u8 r4 = unaligned_load(4* src_stride, src); \
register vec_u8_t r5 = unaligned_load(5* src_stride, src); \ register vec_u8 r5 = unaligned_load(5* src_stride, src); \
register vec_u8_t r6 = unaligned_load(6* src_stride, src); \ register vec_u8 r6 = unaligned_load(6* src_stride, src); \
register vec_u8_t r7 = unaligned_load(7* src_stride, src); \ register vec_u8 r7 = unaligned_load(7* src_stride, src); \
register vec_u8_t r14 = unaligned_load(14*src_stride, src); \ register vec_u8 r14 = unaligned_load(14*src_stride, src); \
register vec_u8_t r15 = unaligned_load(15*src_stride, src); \ register vec_u8 r15 = unaligned_load(15*src_stride, src); \
\ \
r8 = unaligned_load( 8*src_stride, src); \ r8 = unaligned_load( 8*src_stride, src); \
r9 = unaligned_load( 9*src_stride, src); \ r9 = unaligned_load( 9*src_stride, src); \
@ -730,26 +730,26 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
} }
// out: o = |x-y| < a // out: o = |x-y| < a
static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x, static inline vec_u8 diff_lt_altivec ( register vec_u8 x,
register vec_u8_t y, register vec_u8 y,
register vec_u8_t a) { register vec_u8 a) {
register vec_u8_t diff = vec_subs(x, y); register vec_u8 diff = vec_subs(x, y);
register vec_u8_t diffneg = vec_subs(y, x); register vec_u8 diffneg = vec_subs(y, x);
register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */ register vec_u8 o = vec_or(diff, diffneg); /* |x-y| */
o = (vec_u8_t)vec_cmplt(o, a); o = (vec_u8)vec_cmplt(o, a);
return o; return o;
} }
static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0, static inline vec_u8 h264_deblock_mask ( register vec_u8 p0,
register vec_u8_t p1, register vec_u8 p1,
register vec_u8_t q0, register vec_u8 q0,
register vec_u8_t q1, register vec_u8 q1,
register vec_u8_t alpha, register vec_u8 alpha,
register vec_u8_t beta) { register vec_u8 beta) {
register vec_u8_t mask; register vec_u8 mask;
register vec_u8_t tempmask; register vec_u8 tempmask;
mask = diff_lt_altivec(p0, q0, alpha); mask = diff_lt_altivec(p0, q0, alpha);
tempmask = diff_lt_altivec(p1, p0, beta); tempmask = diff_lt_altivec(p1, p0, beta);
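
For reference, the saturating-subtract trick in diff_lt_altivec() above computes |x - y| without any signed arithmetic: vec_subs() clamps the "wrong" direction to zero, so OR-ing the two one-sided differences yields the absolute difference. A per-byte scalar sketch (not part of the patch):

#include <stdint.h>

/* Per-byte equivalent of diff_lt_altivec(): 0xFF where |x - y| < a, else 0. */
static uint8_t diff_lt_scalar(uint8_t x, uint8_t y, uint8_t a)
{
    uint8_t diff    = x > y ? x - y : 0;   /* vec_subs(x, y) */
    uint8_t diffneg = y > x ? y - x : 0;   /* vec_subs(y, x) */
    uint8_t o       = diff | diffneg;      /* |x - y| */
    return o < a ? 0xFF : 0x00;            /* vec_cmplt(o, a) mask */
}
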
@ -761,19 +761,19 @@ static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
} }
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0, static inline vec_u8 h264_deblock_q1(register vec_u8 p0,
register vec_u8_t p1, register vec_u8 p1,
register vec_u8_t p2, register vec_u8 p2,
register vec_u8_t q0, register vec_u8 q0,
register vec_u8_t tc0) { register vec_u8 tc0) {
register vec_u8_t average = vec_avg(p0, q0); register vec_u8 average = vec_avg(p0, q0);
register vec_u8_t temp; register vec_u8 temp;
register vec_u8_t uncliped; register vec_u8 uncliped;
register vec_u8_t ones; register vec_u8 ones;
register vec_u8_t max; register vec_u8 max;
register vec_u8_t min; register vec_u8 min;
register vec_u8_t newp1; register vec_u8 newp1;
temp = vec_xor(average, p2); temp = vec_xor(average, p2);
average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
@ -789,16 +789,16 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
\ \
const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \ const vec_u8 A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
\ \
register vec_u8_t pq0bit = vec_xor(p0,q0); \ register vec_u8 pq0bit = vec_xor(p0,q0); \
register vec_u8_t q1minus; \ register vec_u8 q1minus; \
register vec_u8_t p0minus; \ register vec_u8 p0minus; \
register vec_u8_t stage1; \ register vec_u8 stage1; \
register vec_u8_t stage2; \ register vec_u8 stage2; \
register vec_u8_t vec160; \ register vec_u8 vec160; \
register vec_u8_t delta; \ register vec_u8 delta; \
register vec_u8_t deltaneg; \ register vec_u8 deltaneg; \
\ \
q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
@ -821,16 +821,16 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
DECLARE_ALIGNED_16(unsigned char, temp[16]); \ DECLARE_ALIGNED_16(unsigned char, temp[16]); \
register vec_u8_t alphavec; \ register vec_u8 alphavec; \
register vec_u8_t betavec; \ register vec_u8 betavec; \
register vec_u8_t mask; \ register vec_u8 mask; \
register vec_u8_t p1mask; \ register vec_u8 p1mask; \
register vec_u8_t q1mask; \ register vec_u8 q1mask; \
register vector signed char tc0vec; \ register vector signed char tc0vec; \
register vec_u8_t finaltc0; \ register vec_u8 finaltc0; \
register vec_u8_t tc0masked; \ register vec_u8 tc0masked; \
register vec_u8_t newp1; \ register vec_u8 newp1; \
register vec_u8_t newq1; \ register vec_u8 newq1; \
\ \
temp[0] = alpha; \ temp[0] = alpha; \
temp[1] = beta; \ temp[1] = beta; \
@ -844,18 +844,18 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
tc0vec = vec_mergeh(tc0vec, tc0vec); \ tc0vec = vec_mergeh(tc0vec, tc0vec); \
tc0vec = vec_mergeh(tc0vec, tc0vec); \ tc0vec = vec_mergeh(tc0vec, tc0vec); \
mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \ finaltc0 = vec_and((vec_u8)tc0vec, mask); /* tc = tc0 */ \
\ \
p1mask = diff_lt_altivec(p2, p0, betavec); \ p1mask = diff_lt_altivec(p2, p0, betavec); \
p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \ p1mask = vec_and(p1mask, mask); /* if ( |p2 - p0| < beta) */ \
tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \ tc0masked = vec_and(p1mask, (vec_u8)tc0vec); \
finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
/*end if*/ \ /*end if*/ \
\ \
q1mask = diff_lt_altivec(q2, q0, betavec); \ q1mask = diff_lt_altivec(q2, q0, betavec); \
q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \ tc0masked = vec_and(q1mask, (vec_u8)tc0vec); \
finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
/*end if*/ \ /*end if*/ \
@ -868,12 +868,12 @@ static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
register vec_u8_t p2 = vec_ld(-3*stride, pix); register vec_u8 p2 = vec_ld(-3*stride, pix);
register vec_u8_t p1 = vec_ld(-2*stride, pix); register vec_u8 p1 = vec_ld(-2*stride, pix);
register vec_u8_t p0 = vec_ld(-1*stride, pix); register vec_u8 p0 = vec_ld(-1*stride, pix);
register vec_u8_t q0 = vec_ld(0, pix); register vec_u8 q0 = vec_ld(0, pix);
register vec_u8_t q1 = vec_ld(stride, pix); register vec_u8 q1 = vec_ld(stride, pix);
register vec_u8_t q2 = vec_ld(2*stride, pix); register vec_u8 q2 = vec_ld(2*stride, pix);
h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
vec_st(p1, -2*stride, pix); vec_st(p1, -2*stride, pix);
vec_st(p0, -1*stride, pix); vec_st(p0, -1*stride, pix);
@ -884,7 +884,7 @@ static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {
register vec_u8_t line0, line1, line2, line3, line4, line5; register vec_u8 line0, line1, line2, line3, line4, line5;
if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
return; return;
readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
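
The readAndTranspose16x6() macro above leans on unaligned_load() and vec_lvsl(); the body of unaligned_load() is outside these hunks, but it is presumably the standard AltiVec misaligned-load idiom, sketched here for context (assumes 16 readable bytes past src+offset; illustrative only, not the code from this file):

#include <altivec.h>
#include <stdint.h>

/* Classic AltiVec misaligned load: merge two aligned vec_ld() results
 * with a vec_lvsl() permute. */
static vector unsigned char load_unaligned(int offset, const uint8_t *src)
{
    vector unsigned char hi   = vec_ld(offset,      src);
    vector unsigned char lo   = vec_ld(offset + 15, src);
    vector unsigned char perm = vec_lvsl(offset, src);
    return vec_perm(hi, lo, perm);
}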

libavcodec/ppc/h264_template_altivec.c

@ -28,8 +28,8 @@
/* this code assume that stride % 16 == 0 */ /* this code assume that stride % 16 == 0 */
#define CHROMA_MC8_ALTIVEC_CORE \ #define CHROMA_MC8_ALTIVEC_CORE \
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\ vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\ vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\ \
psum = vec_mladd(vA, vsrc0ssH, v32ss);\ psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vB, vsrc1ssH, psum);\ psum = vec_mladd(vB, vsrc1ssH, psum);\
@ -38,7 +38,7 @@
psum = vec_sr(psum, v6us);\ psum = vec_sr(psum, v6us);\
\ \
vdst = vec_ld(0, dst);\ vdst = vec_ld(0, dst);\
ppsum = (vec_u8_t)vec_pack(psum, psum);\ ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\ vfdst = vec_perm(vdst, ppsum, fperm);\
\ \
OP_U8_ALTIVEC(fsum, vfdst, vdst);\ OP_U8_ALTIVEC(fsum, vfdst, vdst);\
@ -53,15 +53,15 @@
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \ #define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\ \
vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\ vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\ vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\ \
psum = vec_mladd(vA, vsrc0ssH, v32ss);\ psum = vec_mladd(vA, vsrc0ssH, v32ss);\
psum = vec_mladd(vE, vsrc1ssH, psum);\ psum = vec_mladd(vE, vsrc1ssH, psum);\
psum = vec_sr(psum, v6us);\ psum = vec_sr(psum, v6us);\
\ \
vdst = vec_ld(0, dst);\ vdst = vec_ld(0, dst);\
ppsum = (vec_u8_t)vec_pack(psum, psum);\ ppsum = (vec_u8)vec_pack(psum, psum);\
vfdst = vec_perm(vdst, ppsum, fperm);\ vfdst = vec_perm(vdst, ppsum, fperm);\
\ \
OP_U8_ALTIVEC(fsum, vfdst, vdst);\ OP_U8_ALTIVEC(fsum, vfdst, vdst);\
@ -80,34 +80,34 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
((8 - x) * ( y)), ((8 - x) * ( y)),
(( x) * ( y))}; (( x) * ( y))};
register int i; register int i;
vec_u8_t fperm; vec_u8 fperm;
const vec_s32_t vABCD = vec_ld(0, ABCD); const vec_s32 vABCD = vec_ld(0, ABCD);
const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1); const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3); const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5); const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7); const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
LOAD_ZERO; LOAD_ZERO;
const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5)); const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
const vec_u16_t v6us = vec_splat_u16(6); const vec_u16 v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1; vec_u8 vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vec_u8_t vsrc0uc, vsrc1uc; vec_u8 vsrc0uc, vsrc1uc;
vec_s16_t vsrc0ssH, vsrc1ssH; vec_s16 vsrc0ssH, vsrc1ssH;
vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc; vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
vec_s16_t vsrc2ssH, vsrc3ssH, psum; vec_s16 vsrc2ssH, vsrc3ssH, psum;
vec_u8_t vdst, ppsum, vfdst, fsum; vec_u8 vdst, ppsum, vfdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
if (((unsigned long)dst) % 16 == 0) { if (((unsigned long)dst) % 16 == 0) {
fperm = (vec_u8_t){0x10, 0x11, 0x12, 0x13, fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17, 0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F}; 0x0C, 0x0D, 0x0E, 0x0F};
} else { } else {
fperm = (vec_u8_t){0x00, 0x01, 0x02, 0x03, fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B, 0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F}; 0x1C, 0x1D, 0x1E, 0x1F};
@ -126,8 +126,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
else else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc); vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc); vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
if (ABCD[3]) { if (ABCD[3]) {
if (!loadSecond) {// -> !reallyBadAlign if (!loadSecond) {// -> !reallyBadAlign
@ -139,7 +139,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
CHROMA_MC8_ALTIVEC_CORE CHROMA_MC8_ALTIVEC_CORE
} }
} else { } else {
vec_u8_t vsrcDuc; vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src); vsrcDuc = vec_ld(stride + 16, src);
@ -153,7 +153,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
} }
} }
} else { } else {
const vec_s16_t vE = vec_add(vB, vC); const vec_s16 vE = vec_add(vB, vC);
if (ABCD[2]) { // x == 0 B == 0 if (ABCD[2]) { // x == 0 B == 0
if (!loadSecond) {// -> !reallyBadAlign if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
@ -164,7 +164,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
vsrc0uc = vsrc1uc; vsrc0uc = vsrc1uc;
} }
} else { } else {
vec_u8_t vsrcDuc; vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 15, src); vsrcDuc = vec_ld(stride + 15, src);
@ -184,7 +184,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
CHROMA_MC8_ALTIVEC_CORE_SIMPLE CHROMA_MC8_ALTIVEC_CORE_SIMPLE
} }
} else { } else {
vec_u8_t vsrcDuc; vec_u8 vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(0, src); vsrcCuc = vec_ld(0, src);
vsrcDuc = vec_ld(15, src); vsrcDuc = vec_ld(15, src);
@ -210,35 +210,35 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
register int i; register int i;
LOAD_ZERO; LOAD_ZERO;
const vec_u8_t permM2 = vec_lvsl(-2, src); const vec_u8 permM2 = vec_lvsl(-2, src);
const vec_u8_t permM1 = vec_lvsl(-1, src); const vec_u8 permM1 = vec_lvsl(-1, src);
const vec_u8_t permP0 = vec_lvsl(+0, src); const vec_u8 permP0 = vec_lvsl(+0, src);
const vec_u8_t permP1 = vec_lvsl(+1, src); const vec_u8 permP1 = vec_lvsl(+1, src);
const vec_u8_t permP2 = vec_lvsl(+2, src); const vec_u8 permP2 = vec_lvsl(+2, src);
const vec_u8_t permP3 = vec_lvsl(+3, src); const vec_u8 permP3 = vec_lvsl(+3, src);
const vec_s16_t v5ss = vec_splat_s16(5); const vec_s16 v5ss = vec_splat_s16(5);
const vec_u16_t v5us = vec_splat_u16(5); const vec_u16 v5us = vec_splat_u16(5);
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
register int align = ((((unsigned long)src) - 2) % 16); register int align = ((((unsigned long)src) - 2) % 16);
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B, srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B, srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB; psumA, psumB, sumA, sumB;
vec_u8_t sum, vdst, fsum; vec_u8 sum, vdst, fsum;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
for (i = 0 ; i < 16 ; i ++) { for (i = 0 ; i < 16 ; i ++) {
vec_u8_t srcR1 = vec_ld(-2, src); vec_u8 srcR1 = vec_ld(-2, src);
vec_u8_t srcR2 = vec_ld(14, src); vec_u8 srcR2 = vec_ld(14, src);
switch (align) { switch (align) {
default: { default: {
@ -258,7 +258,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = srcR2; srcP3 = srcR2;
} break; } break;
case 12: { case 12: {
vec_u8_t srcR3 = vec_ld(30, src); vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
@ -267,7 +267,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 13: { case 13: {
vec_u8_t srcR3 = vec_ld(30, src); vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
@ -276,7 +276,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 14: { case 14: {
vec_u8_t srcR3 = vec_ld(30, src); vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2; srcP0 = srcR2;
@ -285,7 +285,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 15: { case 15: {
vec_u8_t srcR3 = vec_ld(30, src); vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2; srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0); srcP0 = vec_perm(srcR2, srcR3, permP0);
@ -295,20 +295,20 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
} break; } break;
} }
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
sum1A = vec_adds(srcP0A, srcP1A); sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B); sum1B = vec_adds(srcP0B, srcP1B);
@ -354,52 +354,52 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
register int i; register int i;
LOAD_ZERO; LOAD_ZERO;
const vec_u8_t perm = vec_lvsl(0, src); const vec_u8 perm = vec_lvsl(0, src);
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u16_t v5us = vec_splat_u16(5); const vec_u16 v5us = vec_splat_u16(5);
const vec_s16_t v5ss = vec_splat_s16(5); const vec_s16 v5ss = vec_splat_s16(5);
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4)); const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
uint8_t *srcbis = src - (srcStride * 2); uint8_t *srcbis = src - (srcStride * 2);
const vec_u8_t srcM2a = vec_ld(0, srcbis); const vec_u8 srcM2a = vec_ld(0, srcbis);
const vec_u8_t srcM2b = vec_ld(16, srcbis); const vec_u8 srcM2b = vec_ld(16, srcbis);
const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm); const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
//srcbis += srcStride; //srcbis += srcStride;
const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride); const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
const vec_u8_t srcM1b = vec_ld(16, srcbis); const vec_u8 srcM1b = vec_ld(16, srcbis);
const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm); const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
//srcbis += srcStride; //srcbis += srcStride;
const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride); const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
const vec_u8_t srcP0b = vec_ld(16, srcbis); const vec_u8 srcP0b = vec_ld(16, srcbis);
const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm); const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
//srcbis += srcStride; //srcbis += srcStride;
const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride); const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
const vec_u8_t srcP1b = vec_ld(16, srcbis); const vec_u8 srcP1b = vec_ld(16, srcbis);
const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm); const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
//srcbis += srcStride; //srcbis += srcStride;
const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride); const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
const vec_u8_t srcP2b = vec_ld(16, srcbis); const vec_u8 srcP2b = vec_ld(16, srcbis);
const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm); const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
//srcbis += srcStride; //srcbis += srcStride;
vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2); vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1); vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0); vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1); vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2); vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);
vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB, psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB, srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3; vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
@ -407,8 +407,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3a = vec_ld(0, srcbis += srcStride); srcP3a = vec_ld(0, srcbis += srcStride);
srcP3b = vec_ld(16, srcbis); srcP3b = vec_ld(16, srcbis);
srcP3 = vec_perm(srcP3a, srcP3b, perm); srcP3 = vec_perm(srcP3a, srcP3b, perm);
srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3); srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
//srcbis += srcStride; //srcbis += srcStride;
sum1A = vec_adds(srcP0ssA, srcP1ssA); sum1A = vec_adds(srcP0ssA, srcP1ssA);
@ -463,49 +463,49 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
register int i; register int i;
LOAD_ZERO; LOAD_ZERO;
const vec_u8_t permM2 = vec_lvsl(-2, src); const vec_u8 permM2 = vec_lvsl(-2, src);
const vec_u8_t permM1 = vec_lvsl(-1, src); const vec_u8 permM1 = vec_lvsl(-1, src);
const vec_u8_t permP0 = vec_lvsl(+0, src); const vec_u8 permP0 = vec_lvsl(+0, src);
const vec_u8_t permP1 = vec_lvsl(+1, src); const vec_u8 permP1 = vec_lvsl(+1, src);
const vec_u8_t permP2 = vec_lvsl(+2, src); const vec_u8 permP2 = vec_lvsl(+2, src);
const vec_u8_t permP3 = vec_lvsl(+3, src); const vec_u8 permP3 = vec_lvsl(+3, src);
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2)); const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u32_t v10ui = vec_splat_u32(10); const vec_u32 v10ui = vec_splat_u32(10);
const vec_s16_t v5ss = vec_splat_s16(5); const vec_s16 v5ss = vec_splat_s16(5);
const vec_s16_t v1ss = vec_splat_s16(1); const vec_s16 v1ss = vec_splat_s16(1);
const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9)); const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4)); const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
register int align = ((((unsigned long)src) - 2) % 16); register int align = ((((unsigned long)src) - 2) % 16);
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B, vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B, srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B, srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB; pp1A, pp1B, pp2A, pp2B, psumA, psumB;
const vec_u8_t mperm = (const vec_u8_t) const vec_u8 mperm = (const vec_u8)
{0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F}; 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
int16_t *tmpbis = tmp; int16_t *tmpbis = tmp;
vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB, vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB; tmpP2ssA, tmpP2ssB;
vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo, vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo; ssumAe, ssumAo, ssumBe, ssumBo;
vec_u8_t fsum, sumv, sum, vdst; vec_u8 fsum, sumv, sum, vdst;
vec_s16_t ssume, ssumo; vec_s16 ssume, ssumo;
POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
src -= (2 * srcStride); src -= (2 * srcStride);
for (i = 0 ; i < 21 ; i ++) { for (i = 0 ; i < 21 ; i ++) {
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3; vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vec_u8_t srcR1 = vec_ld(-2, src); vec_u8 srcR1 = vec_ld(-2, src);
vec_u8_t srcR2 = vec_ld(14, src); vec_u8 srcR2 = vec_ld(14, src);
switch (align) { switch (align) {
default: { default: {
@ -525,7 +525,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = srcR2; srcP3 = srcR2;
} break; } break;
case 12: { case 12: {
vec_u8_t srcR3 = vec_ld(30, src); vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
@ -534,7 +534,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 13: { case 13: {
vec_u8_t srcR3 = vec_ld(30, src); vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
@ -543,7 +543,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 14: { case 14: {
vec_u8_t srcR3 = vec_ld(30, src); vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2; srcP0 = srcR2;
@ -552,7 +552,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 15: { case 15: {
vec_u8_t srcR3 = vec_ld(30, src); vec_u8 srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2; srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0); srcP0 = vec_perm(srcR2, srcR3, permP0);
@ -562,20 +562,20 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
} break; } break;
} }
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0); srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0); srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1); srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1); srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);
srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2); srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2); srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3); srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3); srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);
srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1); srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1); srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2); srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2); srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
sum1A = vec_adds(srcP0A, srcP1A); sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B); sum1B = vec_adds(srcP0B, srcP1B);
@ -617,15 +617,15 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
tmpbis += tmpStride; tmpbis += tmpStride;
for (i = 0 ; i < 16 ; i++) { for (i = 0 ; i < 16 ; i++) {
const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis); const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis); const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA); const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB); const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA); const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB); const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA); const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB); const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
tmpbis += tmpStride; tmpbis += tmpStride;
@ -650,9 +650,9 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
pp2Be = vec_mule(sum2B, v5ss); pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss); pp2Bo = vec_mulo(sum2B, v5ss);
pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui); pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
pp3Ao = vec_mulo(sum3A, v1ss); pp3Ao = vec_mulo(sum3A, v1ss);
pp3Be = vec_sra((vec_s32_t)sum3B, v16ui); pp3Be = vec_sra((vec_s32)sum3B, v16ui);
pp3Bo = vec_mulo(sum3B, v1ss); pp3Bo = vec_mulo(sum3B, v1ss);
pp1cAe = vec_add(pp1Ae, v512si); pp1cAe = vec_add(pp1Ae, v512si);

libavcodec/ppc/idct_altivec.c

@ -40,17 +40,9 @@
#include "libavcodec/dsputil.h" #include "libavcodec/dsputil.h"
#include "gcc_fixes.h" #include "gcc_fixes.h"
#include "types_altivec.h"
#include "dsputil_ppc.h" #include "dsputil_ppc.h"
#define vector_s16_t vector signed short
#define const_vector_s16_t const vector signed short
#define vector_u16_t vector unsigned short
#define vector_s8_t vector signed char
#define vector_u8_t vector unsigned char
#define vector_s32_t vector signed int
#define vector_u32_t vector unsigned int
#define IDCT_HALF \ #define IDCT_HALF \
/* 1st stage */ \ /* 1st stage */ \
t1 = vec_mradds (a1, vx7, vx1 ); \ t1 = vec_mradds (a1, vx7, vx1 ); \
@ -88,11 +80,11 @@
#define IDCT \ #define IDCT \
vector_s16_t vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \ vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
vector_s16_t vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \ vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
vector_s16_t a0, a1, a2, ma2, c4, mc4, zero, bias; \ vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \
vector_s16_t t0, t1, t2, t3, t4, t5, t6, t7, t8; \ vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
vector_u16_t shift; \ vec_u16 shift; \
\ \
c4 = vec_splat (constants[0], 0); \ c4 = vec_splat (constants[0], 0); \
a0 = vec_splat (constants[0], 1); \ a0 = vec_splat (constants[0], 1); \
@ -100,7 +92,7 @@
a2 = vec_splat (constants[0], 3); \ a2 = vec_splat (constants[0], 3); \
mc4 = vec_splat (constants[0], 4); \ mc4 = vec_splat (constants[0], 4); \
ma2 = vec_splat (constants[0], 5); \ ma2 = vec_splat (constants[0], 5); \
bias = (vector_s16_t)vec_splat ((vector_s32_t)constants[0], 3); \ bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \
\ \
zero = vec_splat_s16 (0); \ zero = vec_splat_s16 (0); \
shift = vec_splat_u16 (4); \ shift = vec_splat_u16 (4); \
@ -156,7 +148,7 @@
vx7 = vec_sra (vy7, shift); vx7 = vec_sra (vy7, shift);
static const_vector_s16_t constants[5] = { static const vec_s16 constants[5] = {
{23170, 13573, 6518, 21895, -23170, -21895, 32, 31}, {23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
{16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725}, {16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
{22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521}, {22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
@ -164,10 +156,10 @@ static const_vector_s16_t constants[5] = {
{19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722} {19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
}; };
void idct_put_altivec(uint8_t* dest, int stride, vector_s16_t* block) void idct_put_altivec(uint8_t* dest, int stride, vec_s16* block)
{ {
POWERPC_PERF_DECLARE(altivec_idct_put_num, 1); POWERPC_PERF_DECLARE(altivec_idct_put_num, 1);
vector_u8_t tmp; vec_u8 tmp;
#ifdef CONFIG_POWERPC_PERF #ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1); POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
@ -176,8 +168,8 @@ POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
#define COPY(dest,src) \ #define COPY(dest,src) \
tmp = vec_packsu (src, src); \ tmp = vec_packsu (src, src); \
vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
COPY (dest, vx0) dest += stride; COPY (dest, vx0) dest += stride;
COPY (dest, vx1) dest += stride; COPY (dest, vx1) dest += stride;
@ -191,14 +183,14 @@ POWERPC_PERF_START_COUNT(altivec_idct_put_num, 1);
POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1); POWERPC_PERF_STOP_COUNT(altivec_idct_put_num, 1);
} }
void idct_add_altivec(uint8_t* dest, int stride, vector_s16_t* block) void idct_add_altivec(uint8_t* dest, int stride, vec_s16* block)
{ {
POWERPC_PERF_DECLARE(altivec_idct_add_num, 1); POWERPC_PERF_DECLARE(altivec_idct_add_num, 1);
vector_u8_t tmp; vec_u8 tmp;
vector_s16_t tmp2, tmp3; vec_s16 tmp2, tmp3;
vector_u8_t perm0; vec_u8 perm0;
vector_u8_t perm1; vec_u8 perm1;
vector_u8_t p0, p1, p; vec_u8 p0, p1, p;
#ifdef CONFIG_POWERPC_PERF #ifdef CONFIG_POWERPC_PERF
POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1); POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
@ -215,11 +207,11 @@ POWERPC_PERF_START_COUNT(altivec_idct_add_num, 1);
#define ADD(dest,src,perm) \ #define ADD(dest,src,perm) \
/* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \
tmp = vec_ld (0, dest); \ tmp = vec_ld (0, dest); \
tmp2 = (vector_s16_t)vec_perm (tmp, (vector_u8_t)zero, perm); \ tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \
tmp3 = vec_adds (tmp2, src); \ tmp3 = vec_adds (tmp2, src); \
tmp = vec_packsu (tmp3, tmp3); \ tmp = vec_packsu (tmp3, tmp3); \
vec_ste ((vector_u32_t)tmp, 0, (unsigned int *)dest); \ vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
vec_ste ((vector_u32_t)tmp, 4, (unsigned int *)dest); vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
ADD (dest, vx0, perm0) dest += stride; ADD (dest, vx0, perm0) dest += stride;
ADD (dest, vx1, perm1) dest += stride; ADD (dest, vx1, perm1) dest += stride;
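
The ADD() macro above is a load/widen/saturating-add/pack/store sequence; per 8-sample row it boils down to the following scalar operation (sketch, not part of the patch):

#include <stdint.h>

/* Scalar equivalent of one ADD(dest, vx, perm) step: add an IDCT row into
 * dest with the 0..255 saturation that vec_packsu() provides. */
static void idct_add_row_scalar(uint8_t *dest, const int16_t *row)
{
    for (int i = 0; i < 8; i++) {
        int v = dest[i] + row[i];
        dest[i] = v < 0 ? 0 : v > 255 ? 255 : v;
    }
}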

libavcodec/ppc/int_altivec.c

@ -79,10 +79,10 @@ static int ssd_int8_vs_int16_altivec(const int8_t *pix1, const int16_t *pix2,
static void add_int16_altivec(int16_t * v1, int16_t * v2, int order) static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
{ {
int i; int i;
register vec_s16_t vec, *pv; register vec_s16 vec, *pv;
for(i = 0; i < order; i += 8){ for(i = 0; i < order; i += 8){
pv = (vec_s16_t*)v2; pv = (vec_s16*)v2;
vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2)); vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
vec_st(vec_add(vec_ld(0, v1), vec), 0, v1); vec_st(vec_add(vec_ld(0, v1), vec), 0, v1);
v1 += 8; v1 += 8;
@ -93,10 +93,10 @@ static void add_int16_altivec(int16_t * v1, int16_t * v2, int order)
static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order) static void sub_int16_altivec(int16_t * v1, int16_t * v2, int order)
{ {
int i; int i;
register vec_s16_t vec, *pv; register vec_s16 vec, *pv;
for(i = 0; i < order; i += 8){ for(i = 0; i < order; i += 8){
pv = (vec_s16_t*)v2; pv = (vec_s16*)v2;
vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2)); vec = vec_perm(pv[0], pv[1], vec_lvsl(0, v2));
vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1); vec_st(vec_sub(vec_ld(0, v1), vec), 0, v1);
v1 += 8; v1 += 8;
@ -108,9 +108,9 @@ static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order
{ {
int i; int i;
LOAD_ZERO; LOAD_ZERO;
register vec_s16_t vec1, *pv; register vec_s16 vec1, *pv;
register vec_s32_t res = vec_splat_s32(0), t; register vec_s32 res = vec_splat_s32(0), t;
register vec_u32_t shifts; register vec_u32 shifts;
DECLARE_ALIGNED_16(int32_t, ires); DECLARE_ALIGNED_16(int32_t, ires);
shifts = zero_u32v; shifts = zero_u32v;
@ -121,7 +121,7 @@ static int32_t scalarproduct_int16_altivec(int16_t * v1, int16_t * v2, int order
if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01)); if(shift & 0x01) shifts = vec_add(shifts, vec_splat_u32(0x01));
for(i = 0; i < order; i += 8){ for(i = 0; i < order; i += 8){
pv = (vec_s16_t*)v1; pv = (vec_s16*)v1;
vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1)); vec1 = vec_perm(pv[0], pv[1], vec_lvsl(0, v1));
t = vec_msum(vec1, vec_ld(0, v2), zero_s32v); t = vec_msum(vec1, vec_ld(0, v2), zero_s32v);
t = vec_sr(t, shifts); t = vec_sr(t, shifts);
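
For orientation, scalarproduct_int16_altivec() above computes a shifted dot product of two int16 arrays; a scalar sketch follows. The per-term shift placement here is an assumption made for illustration — the vector code applies the shift to vec_msum() partial sums instead:

#include <stdint.h>

/* Shifted dot product of two int16 vectors; shift placement is illustrative. */
static int32_t scalarproduct_int16_scalar(const int16_t *v1, const int16_t *v2,
                                          int order, int shift)
{
    int32_t res = 0;
    while (order--)
        res += (*v1++ * *v2++) >> shift;
    return res;
}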

libavcodec/ppc/types_altivec.h

@ -24,23 +24,23 @@
/*********************************************************************** /***********************************************************************
* Vector types * Vector types
**********************************************************************/ **********************************************************************/
#define vec_u8_t vector unsigned char #define vec_u8 vector unsigned char
#define vec_s8_t vector signed char #define vec_s8 vector signed char
#define vec_u16_t vector unsigned short #define vec_u16 vector unsigned short
#define vec_s16_t vector signed short #define vec_s16 vector signed short
#define vec_u32_t vector unsigned int #define vec_u32 vector unsigned int
#define vec_s32_t vector signed int #define vec_s32 vector signed int
/*********************************************************************** /***********************************************************************
* Null vector * Null vector
**********************************************************************/ **********************************************************************/
#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 ) #define LOAD_ZERO const vec_u8 zerov = vec_splat_u8( 0 )
#define zero_u8v (vec_u8_t) zerov #define zero_u8v (vec_u8) zerov
#define zero_s8v (vec_s8_t) zerov #define zero_s8v (vec_s8) zerov
#define zero_u16v (vec_u16_t) zerov #define zero_u16v (vec_u16) zerov
#define zero_s16v (vec_s16_t) zerov #define zero_s16v (vec_s16) zerov
#define zero_u32v (vec_u32_t) zerov #define zero_u32v (vec_u32) zerov
#define zero_s32v (vec_s32_t) zerov #define zero_s32v (vec_s32) zerov
#endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */ #endif /* AVCODEC_PPC_TYPES_ALTIVEC_H */
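
After the rename, types_altivec.h exposes the shorter vec_* aliases plus the LOAD_ZERO helper used throughout the files above; a minimal usage sketch, assuming GCC's AltiVec intrinsics and a 16-byte-aligned source pointer:

#include <altivec.h>
#include <stdint.h>
#include "types_altivec.h"   /* vec_u8, vec_s16, LOAD_ZERO, zero_u8v, ... */

/* Zero-extend the first 8 bytes of src to signed 16-bit, the
 * vec_mergeh(zero_u8v, x) idiom the H.264 code above relies on. */
static vec_s16 widen_lo_u8(const uint8_t *src)
{
    LOAD_ZERO;                               /* declares const vec_u8 zerov */
    vec_u8 pix = vec_ld(0, src);
    return (vec_s16) vec_mergeh(zero_u8v, pix);
}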