13% faster inner_add_yblock
Originally committed as revision 5316 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
f420826c35
commit
5f22aa3cc0
@ -413,6 +413,96 @@ void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define LOAD_BLOCKS \
|
||||||
|
tmp1 = vec_ld(0, &block[3][y*src_stride]);\
|
||||||
|
align = vec_lvsl(0, &block[3][y*src_stride]);\
|
||||||
|
tmp2 = vec_ld(15, &block[3][y*src_stride]);\
|
||||||
|
\
|
||||||
|
b3 = vec_perm(tmp1,tmp2,align);\
|
||||||
|
\
|
||||||
|
tmp1 = vec_ld(0, &block[2][y*src_stride]);\
|
||||||
|
align = vec_lvsl(0, &block[2][y*src_stride]);\
|
||||||
|
tmp2 = vec_ld(15, &block[2][y*src_stride]);\
|
||||||
|
\
|
||||||
|
b2 = vec_perm(tmp1,tmp2,align);\
|
||||||
|
\
|
||||||
|
tmp1 = vec_ld(0, &block[1][y*src_stride]);\
|
||||||
|
align = vec_lvsl(0, &block[1][y*src_stride]);\
|
||||||
|
tmp2 = vec_ld(15, &block[1][y*src_stride]);\
|
||||||
|
\
|
||||||
|
b1 = vec_perm(tmp1,tmp2,align);\
|
||||||
|
\
|
||||||
|
tmp1 = vec_ld(0, &block[0][y*src_stride]);\
|
||||||
|
align = vec_lvsl(0, &block[0][y*src_stride]);\
|
||||||
|
tmp2 = vec_ld(15, &block[0][y*src_stride]);\
|
||||||
|
\
|
||||||
|
b0 = vec_perm(tmp1,tmp2,align);
|
||||||
|
|
||||||
|
#define LOAD_OBMCS \
|
||||||
|
tmp1 = vec_ld(0, obmc1);\
|
||||||
|
align = vec_lvsl(0, obmc1);\
|
||||||
|
tmp2 = vec_ld(15, obmc1);\
|
||||||
|
\
|
||||||
|
ob1 = vec_perm(tmp1,tmp2,align);\
|
||||||
|
\
|
||||||
|
tmp1 = vec_ld(0, obmc2);\
|
||||||
|
align = vec_lvsl(0, obmc2);\
|
||||||
|
tmp2 = vec_ld(15, obmc2);\
|
||||||
|
\
|
||||||
|
ob2 = vec_perm(tmp1,tmp2,align);\
|
||||||
|
\
|
||||||
|
tmp1 = vec_ld(0, obmc3);\
|
||||||
|
align = vec_lvsl(0, obmc3);\
|
||||||
|
tmp2 = vec_ld(15, obmc3);\
|
||||||
|
\
|
||||||
|
ob3 = vec_perm(tmp1,tmp2,align);\
|
||||||
|
\
|
||||||
|
tmp1 = vec_ld(0, obmc4);\
|
||||||
|
align = vec_lvsl(0, obmc4);\
|
||||||
|
tmp2 = vec_ld(15, obmc4);\
|
||||||
|
\
|
||||||
|
ob4 = vec_perm(tmp1,tmp2,align);
|
||||||
|
|
||||||
|
/* interleave logic
|
||||||
|
* h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
|
||||||
|
* h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
|
||||||
|
* h <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define STEPS_0_1\
|
||||||
|
h1 = (vector unsigned short)\
|
||||||
|
vec_mergeh(ob1, ob2);\
|
||||||
|
\
|
||||||
|
h2 = (vector unsigned short)\
|
||||||
|
vec_mergeh(ob3, ob4);\
|
||||||
|
\
|
||||||
|
ih = (vector unsigned char)\
|
||||||
|
vec_mergeh(h1,h2);\
|
||||||
|
\
|
||||||
|
l1 = (vector unsigned short) vec_mergeh(b3, b2);\
|
||||||
|
\
|
||||||
|
ih1 = (vector unsigned char) vec_mergel(h1, h2);\
|
||||||
|
\
|
||||||
|
l2 = (vector unsigned short) vec_mergeh(b1, b0);\
|
||||||
|
\
|
||||||
|
il = (vector unsigned char) vec_mergeh(l1, l2);\
|
||||||
|
\
|
||||||
|
v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
|
||||||
|
\
|
||||||
|
il1 = (vector unsigned char) vec_mergel(l1, l2);\
|
||||||
|
\
|
||||||
|
v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
|
||||||
|
|
||||||
|
#define FINAL_STEP_SCALAR\
|
||||||
|
for(x=0; x<b_w; x++)\
|
||||||
|
if(add){\
|
||||||
|
vbuf[x] += dst[x + src_x];\
|
||||||
|
vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
|
||||||
|
if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
|
||||||
|
dst8[x + y*src_stride] = vbuf[x];\
|
||||||
|
}else{\
|
||||||
|
dst[x + src_x] -= vbuf[x];\
|
||||||
|
}
|
||||||
|
|
||||||
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
|
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
|
||||||
const int obmc_stride,
|
const int obmc_stride,
|
||||||
@ -423,11 +513,13 @@ static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
|
|||||||
{
|
{
|
||||||
int y, x;
|
int y, x;
|
||||||
DWTELEM * dst;
|
DWTELEM * dst;
|
||||||
// vector bool int mask;
|
|
||||||
// vector signed int vs;
|
|
||||||
vector unsigned short h1, h2, l1, l2;
|
vector unsigned short h1, h2, l1, l2;
|
||||||
vector unsigned char ih, il, tmp1, tmp2, align;
|
vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
|
||||||
vector unsigned char b0,b1,b2,b3;
|
vector unsigned char b0,b1,b2,b3;
|
||||||
|
vector unsigned char ob1,ob2,ob3,ob4;
|
||||||
|
|
||||||
|
DECLARE_ALIGNED_16(int, vbuf[16]);
|
||||||
|
vector signed int *v = (vector signed int *)vbuf, *d;
|
||||||
|
|
||||||
for(y=0; y<b_h; y++){
|
for(y=0; y<b_h; y++){
|
||||||
//FIXME ugly missue of obmc_stride
|
//FIXME ugly missue of obmc_stride
|
||||||
@ -436,167 +528,49 @@ static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
|
|||||||
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
|
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
|
||||||
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
|
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
|
||||||
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
|
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
|
||||||
#if 1
|
|
||||||
vector unsigned char ob1;
|
|
||||||
vector unsigned char ob2;
|
|
||||||
vector unsigned char ob3;
|
|
||||||
vector unsigned char ob4;
|
|
||||||
|
|
||||||
#endif
|
|
||||||
DECLARE_ALIGNED_16(int, vbuf[16]);
|
|
||||||
vector signed int *v = (vector signed int *)vbuf, *d;
|
|
||||||
|
|
||||||
dst = slice_buffer_get_line(sb, src_y + y);
|
dst = slice_buffer_get_line(sb, src_y + y);
|
||||||
d = (vector signed int *)(dst + src_x);
|
d = (vector signed int *)(dst + src_x);
|
||||||
|
|
||||||
#if 0
|
//FIXME i could avoid some loads!
|
||||||
for(x=0; x<b_w; x++){
|
|
||||||
vbuf[x] = obmc1[x] * block[3][x + y*src_stride]
|
|
||||||
+obmc2[x] * block[2][x + y*src_stride]
|
|
||||||
+obmc3[x] * block[1][x + y*src_stride]
|
|
||||||
+obmc4[x] * block[0][x + y*src_stride];
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
|
|
||||||
|
|
||||||
// load blocks
|
// load blocks
|
||||||
//FIXME i could avoid some loads!
|
LOAD_BLOCKS
|
||||||
tmp1 = vec_ld(0, &block[3][y*src_stride]);
|
|
||||||
align = vec_lvsl(0, &block[3][y*src_stride]);
|
|
||||||
tmp2 = vec_ld(15, &block[3][y*src_stride]);
|
|
||||||
|
|
||||||
b3 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, &block[2][y*src_stride]);
|
|
||||||
align = vec_lvsl(0, &block[2][y*src_stride]);
|
|
||||||
tmp2 = vec_ld(15, &block[2][y*src_stride]);
|
|
||||||
|
|
||||||
b2 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, &block[1][y*src_stride]);
|
|
||||||
align = vec_lvsl(0, &block[1][y*src_stride]);
|
|
||||||
tmp2 = vec_ld(15, &block[1][y*src_stride]);
|
|
||||||
|
|
||||||
b1 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, &block[0][y*src_stride]);
|
|
||||||
align = vec_lvsl(0, &block[0][y*src_stride]);
|
|
||||||
tmp2 = vec_ld(15, &block[0][y*src_stride]);
|
|
||||||
|
|
||||||
b0 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
// load obmcs
|
// load obmcs
|
||||||
|
LOAD_OBMCS
|
||||||
|
|
||||||
tmp1 = vec_ld(0, obmc1);
|
// steps 0 1
|
||||||
align = vec_lvsl(0, obmc1);
|
STEPS_0_1
|
||||||
tmp2 = vec_ld(15, obmc1);
|
|
||||||
|
|
||||||
ob1 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, obmc2);
|
|
||||||
align = vec_lvsl(0, obmc2);
|
|
||||||
tmp2 = vec_ld(15, obmc2);
|
|
||||||
|
|
||||||
ob2 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, obmc3);
|
|
||||||
align = vec_lvsl(0, obmc3);
|
|
||||||
tmp2 = vec_ld(15, obmc3);
|
|
||||||
|
|
||||||
ob3 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, obmc4);
|
|
||||||
align = vec_lvsl(0, obmc4);
|
|
||||||
tmp2 = vec_ld(15, obmc4);
|
|
||||||
|
|
||||||
ob4 = vec_perm(tmp1,tmp2,align);
|
|
||||||
h1 = (vector unsigned short)
|
|
||||||
vec_mergeh(ob1, ob2); /*h1 <- [ a,b,a,b, a,b,a,b,
|
|
||||||
a,b,a,b, a,b,a,b ] */
|
|
||||||
h2 = (vector unsigned short)
|
|
||||||
vec_mergeh(ob3, ob4); /*h2 <- [ c,d,c,d, c,d,c,d,
|
|
||||||
c,d,c,d, c,d,c,d ] */
|
|
||||||
|
|
||||||
ih = (vector unsigned char)
|
|
||||||
vec_mergeh(h1,h2); /*ih <- [ a,b,c,d, a,b,c,d,
|
|
||||||
a,b,c,d, a,b,c,d ]*/
|
|
||||||
|
|
||||||
l1 = (vector unsigned short) vec_mergeh(b3, b2);
|
|
||||||
|
|
||||||
l2 = (vector unsigned short) vec_mergeh(b1, b0);
|
|
||||||
|
|
||||||
il = (vector unsigned char) vec_mergeh(l1, l2);
|
|
||||||
|
|
||||||
v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
|
|
||||||
//step1
|
|
||||||
|
|
||||||
h1 = (vector unsigned short) vec_mergeh(ob1, ob2);
|
|
||||||
|
|
||||||
h2 = (vector unsigned short) vec_mergeh(ob3, ob4);
|
|
||||||
|
|
||||||
ih = (vector unsigned char) vec_mergel(h1, h2);
|
|
||||||
|
|
||||||
l1 = (vector unsigned short) vec_mergeh(b3, b2);
|
|
||||||
|
|
||||||
l2 = (vector unsigned short) vec_mergeh(b1, b0);
|
|
||||||
|
|
||||||
il = (vector unsigned char) vec_mergel(l1, l2);
|
|
||||||
|
|
||||||
v[1] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
|
|
||||||
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if 1
|
|
||||||
for(x=0; x<b_w; x++)
|
|
||||||
if(add){
|
|
||||||
vbuf[x] += dst[x + src_x];
|
|
||||||
vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
|
|
||||||
if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
|
|
||||||
dst8[x + y*src_stride] = vbuf[x];
|
|
||||||
}else{
|
|
||||||
dst[x + src_x] -= vbuf[x];
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
if(add)
|
|
||||||
{
|
|
||||||
for(x=0; x<b_w/4; x++)
|
|
||||||
{
|
|
||||||
v[x] = vec_add(v[x], d[x]);
|
|
||||||
v[x] = vec_sra(vec_add(v[x],
|
|
||||||
vec_sl( vec_splat_s32(1),
|
|
||||||
vec_splat_u32(7))),
|
|
||||||
vec_splat_u32(8));
|
|
||||||
|
|
||||||
mask = (vector bool int)
|
|
||||||
vec_sl((vector signed int) vec_cmpeq(v[x],v[x]),
|
|
||||||
vec_splat_u32(8));
|
|
||||||
mask = (vector bool int)
|
|
||||||
vec_and(v[x],vec_nor(mask,mask));
|
|
||||||
|
|
||||||
mask = (vector bool int)
|
|
||||||
vec_cmpeq((vector signed int)mask, vec_splat_s32(0));
|
|
||||||
|
|
||||||
vs = vec_sra(v[x],vec_splat_u32(8));
|
|
||||||
vs = vec_sra(v[x],vec_splat_u32(8));
|
|
||||||
vs = vec_sra(v[x],vec_splat_u32(15));
|
|
||||||
|
|
||||||
vs = vec_nor(vs,vs);
|
|
||||||
|
|
||||||
v[x]= vec_sel(v[x],vs,mask);
|
|
||||||
}
|
|
||||||
for(x=0; x<b_w; x++)
|
|
||||||
dst8[x + y*src_stride] = vbuf[x];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
for(x=0; x<b_w/4; x++)
|
|
||||||
d[x] = vec_sub(d[x], v[x]);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
|
FINAL_STEP_SCALAR
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#define STEPS_2_3\
|
||||||
|
h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
|
||||||
|
\
|
||||||
|
h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
|
||||||
|
\
|
||||||
|
ih = (vector unsigned char) vec_mergeh(h1,h2);\
|
||||||
|
\
|
||||||
|
l1 = (vector unsigned short) vec_mergel(b3, b2);\
|
||||||
|
\
|
||||||
|
l2 = (vector unsigned short) vec_mergel(b1, b0);\
|
||||||
|
\
|
||||||
|
ih1 = (vector unsigned char) vec_mergel(h1,h2);\
|
||||||
|
\
|
||||||
|
il = (vector unsigned char) vec_mergeh(l1,l2);\
|
||||||
|
\
|
||||||
|
v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
|
||||||
|
\
|
||||||
|
il1 = (vector unsigned char) vec_mergel(l1,l2);\
|
||||||
|
\
|
||||||
|
v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
|
||||||
|
|
||||||
|
|
||||||
static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
|
static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
|
||||||
const int obmc_stride,
|
const int obmc_stride,
|
||||||
uint8_t * * block, int b_w,
|
uint8_t * * block, int b_w,
|
||||||
@ -607,8 +581,11 @@ static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
|
|||||||
int y, x;
|
int y, x;
|
||||||
DWTELEM * dst;
|
DWTELEM * dst;
|
||||||
vector unsigned short h1, h2, l1, l2;
|
vector unsigned short h1, h2, l1, l2;
|
||||||
vector unsigned char ih, il, tmp1, tmp2, align;
|
vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
|
||||||
vector unsigned char b0,b1,b2,b3;
|
vector unsigned char b0,b1,b2,b3;
|
||||||
|
vector unsigned char ob1,ob2,ob3,ob4;
|
||||||
|
DECLARE_ALIGNED_16(int, vbuf[b_w]);
|
||||||
|
vector signed int *v = (vector signed int *)vbuf, *d;
|
||||||
|
|
||||||
for(y=0; y<b_h; y++){
|
for(y=0; y<b_h; y++){
|
||||||
//FIXME ugly missue of obmc_stride
|
//FIXME ugly missue of obmc_stride
|
||||||
@ -618,182 +595,151 @@ static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
|
|||||||
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
|
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
|
||||||
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
|
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
|
||||||
|
|
||||||
vector unsigned char ob1;
|
dst = slice_buffer_get_line(sb, src_y + y);
|
||||||
vector unsigned char ob2;
|
d = (vector signed int *)(dst + src_x);
|
||||||
vector unsigned char ob3;
|
|
||||||
vector unsigned char ob4;
|
|
||||||
|
|
||||||
|
// load blocks
|
||||||
|
LOAD_BLOCKS
|
||||||
|
|
||||||
|
// load obmcs
|
||||||
|
LOAD_OBMCS
|
||||||
|
|
||||||
|
// steps 0 1 2 3
|
||||||
|
STEPS_0_1
|
||||||
|
|
||||||
|
STEPS_2_3
|
||||||
|
|
||||||
|
FINAL_STEP_SCALAR
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define FINAL_STEP_VEC \
|
||||||
|
\
|
||||||
|
if(add)\
|
||||||
|
{\
|
||||||
|
for(x=0; x<b_w/4; x++)\
|
||||||
|
{\
|
||||||
|
v[x] = vec_add(v[x], d[x]);\
|
||||||
|
v[x] = vec_sra(vec_add(v[x],\
|
||||||
|
vec_sl( vec_splat_s32(1),\
|
||||||
|
vec_splat_u32(7))),\
|
||||||
|
vec_splat_u32(8));\
|
||||||
|
\
|
||||||
|
mask = vec_sl((vector signed int)\
|
||||||
|
vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
|
||||||
|
mask = vec_and(v[x],vec_nor(mask,mask));\
|
||||||
|
\
|
||||||
|
mask = (vector signed int)\
|
||||||
|
vec_cmpeq((vector signed int)mask,\
|
||||||
|
(vector signed int)vec_splat_u32(0));\
|
||||||
|
\
|
||||||
|
vs = vec_sra(v[x],vec_splat_u32(8));\
|
||||||
|
vs = vec_sra(v[x],vec_splat_u32(8));\
|
||||||
|
vs = vec_sra(v[x],vec_splat_u32(15));\
|
||||||
|
\
|
||||||
|
vs = vec_nor(vs,vs);\
|
||||||
|
\
|
||||||
|
v[x]= vec_sel(v[x],vs,mask);\
|
||||||
|
}\
|
||||||
|
\
|
||||||
|
for(x=0; x<b_w; x++)\
|
||||||
|
dst8[x + y*src_stride] = vbuf[x];\
|
||||||
|
\
|
||||||
|
}\
|
||||||
|
else\
|
||||||
|
for(x=0; x<b_w/4; x++)\
|
||||||
|
d[x] = vec_sub(d[x], v[x]);
|
||||||
|
|
||||||
|
static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
|
||||||
|
const int obmc_stride,
|
||||||
|
uint8_t * * block, int b_w,
|
||||||
|
int b_h, int src_x, int src_y,
|
||||||
|
int src_stride, slice_buffer * sb,
|
||||||
|
int add, uint8_t * dst8)
|
||||||
|
{
|
||||||
|
int y, x;
|
||||||
|
DWTELEM * dst;
|
||||||
|
vector bool int mask;
|
||||||
|
vector signed int vs;
|
||||||
|
vector unsigned short h1, h2, l1, l2;
|
||||||
|
vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
|
||||||
|
vector unsigned char b0,b1,b2,b3;
|
||||||
|
vector unsigned char ob1,ob2,ob3,ob4;
|
||||||
|
|
||||||
|
DECLARE_ALIGNED_16(int, vbuf[16]);
|
||||||
|
vector signed int *v = (vector signed int *)vbuf, *d;
|
||||||
|
|
||||||
|
for(y=0; y<b_h; y++){
|
||||||
|
//FIXME ugly missue of obmc_stride
|
||||||
|
|
||||||
|
uint8_t *obmc1= obmc + y*obmc_stride;
|
||||||
|
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
|
||||||
|
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
|
||||||
|
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
|
||||||
|
|
||||||
|
dst = slice_buffer_get_line(sb, src_y + y);
|
||||||
|
d = (vector signed int *)(dst + src_x);
|
||||||
|
|
||||||
|
//FIXME i could avoid some loads!
|
||||||
|
|
||||||
|
// load blocks
|
||||||
|
LOAD_BLOCKS
|
||||||
|
|
||||||
|
// load obmcs
|
||||||
|
LOAD_OBMCS
|
||||||
|
|
||||||
|
// steps 0 1
|
||||||
|
STEPS_0_1
|
||||||
|
|
||||||
|
FINAL_STEP_VEC
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
|
||||||
|
const int obmc_stride,
|
||||||
|
uint8_t * * block, int b_w,
|
||||||
|
int b_h, int src_x, int src_y,
|
||||||
|
int src_stride, slice_buffer * sb,
|
||||||
|
int add, uint8_t * dst8)
|
||||||
|
{
|
||||||
|
int y, x;
|
||||||
|
DWTELEM * dst;
|
||||||
|
vector bool int mask;
|
||||||
|
vector signed int vs;
|
||||||
|
vector unsigned short h1, h2, l1, l2;
|
||||||
|
vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
|
||||||
|
vector unsigned char b0,b1,b2,b3;
|
||||||
|
vector unsigned char ob1,ob2,ob3,ob4;
|
||||||
DECLARE_ALIGNED_16(int, vbuf[b_w]);
|
DECLARE_ALIGNED_16(int, vbuf[b_w]);
|
||||||
vector signed int *v = (vector signed int *)vbuf, *d;
|
vector signed int *v = (vector signed int *)vbuf, *d;
|
||||||
|
|
||||||
|
for(y=0; y<b_h; y++){
|
||||||
|
//FIXME ugly missue of obmc_stride
|
||||||
|
|
||||||
|
uint8_t *obmc1= obmc + y*obmc_stride;
|
||||||
|
uint8_t *obmc2= obmc1+ (obmc_stride>>1);
|
||||||
|
uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
|
||||||
|
uint8_t *obmc4= obmc3+ (obmc_stride>>1);
|
||||||
|
|
||||||
dst = slice_buffer_get_line(sb, src_y + y);
|
dst = slice_buffer_get_line(sb, src_y + y);
|
||||||
d = (vector signed int *)(dst + src_x);
|
d = (vector signed int *)(dst + src_x);
|
||||||
|
|
||||||
// load blocks
|
// load blocks
|
||||||
|
LOAD_BLOCKS
|
||||||
tmp1 = vec_ld(0, &block[3][y*src_stride]);
|
|
||||||
align = vec_lvsl(0, &block[3][y*src_stride]);
|
|
||||||
tmp2 = vec_ld(15, &block[3][y*src_stride]);
|
|
||||||
|
|
||||||
b3 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, &block[2][y*src_stride]);
|
|
||||||
align = vec_lvsl(0, &block[2][y*src_stride]);
|
|
||||||
tmp2 = vec_ld(15, &block[2][y*src_stride]);
|
|
||||||
|
|
||||||
b2 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, &block[1][y*src_stride]);
|
|
||||||
align = vec_lvsl(0, &block[1][y*src_stride]);
|
|
||||||
tmp2 = vec_ld(15, &block[1][y*src_stride]);
|
|
||||||
|
|
||||||
b1 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, &block[0][y*src_stride]);
|
|
||||||
align = vec_lvsl(0, &block[0][y*src_stride]);
|
|
||||||
tmp2 = vec_ld(15, &block[0][y*src_stride]);
|
|
||||||
|
|
||||||
b0 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
// load obmcs
|
// load obmcs
|
||||||
|
LOAD_OBMCS
|
||||||
|
|
||||||
tmp1 = vec_ld(0, obmc1);
|
// steps 0 1 2 3
|
||||||
align = vec_lvsl(0, obmc1);
|
STEPS_0_1
|
||||||
tmp2 = vec_ld(15, obmc1);
|
|
||||||
|
|
||||||
ob1 = vec_perm(tmp1,tmp2,align);
|
STEPS_2_3
|
||||||
|
|
||||||
tmp1 = vec_ld(0, obmc2);
|
FINAL_STEP_VEC
|
||||||
align = vec_lvsl(0, obmc2);
|
|
||||||
tmp2 = vec_ld(15, obmc2);
|
|
||||||
|
|
||||||
ob2 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, obmc3);
|
|
||||||
align = vec_lvsl(0, obmc3);
|
|
||||||
tmp2 = vec_ld(15, obmc3);
|
|
||||||
|
|
||||||
ob3 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
tmp1 = vec_ld(0, obmc4);
|
|
||||||
align = vec_lvsl(0, obmc4);
|
|
||||||
tmp2 = vec_ld(15, obmc4);
|
|
||||||
|
|
||||||
ob4 = vec_perm(tmp1,tmp2,align);
|
|
||||||
|
|
||||||
//step0
|
|
||||||
h1 = (vector unsigned short)
|
|
||||||
vec_mergeh(ob1, ob2); /*h1 <- [ a,b,a,b,
|
|
||||||
a,b,a,b,
|
|
||||||
a,b,a,b,
|
|
||||||
a,b,a,b ] */
|
|
||||||
h2 = (vector unsigned short)
|
|
||||||
vec_mergeh(ob3, ob4); /*h2 <- [ c,d,c,d,
|
|
||||||
c,d,c,d,
|
|
||||||
c,d,c,d,
|
|
||||||
c,d,c,d ] */
|
|
||||||
|
|
||||||
ih = (vector unsigned char)
|
|
||||||
vec_mergeh(h1,h2); /*ih <- [ a,b,c,d,
|
|
||||||
a,b,c,d,
|
|
||||||
a,b,c,d,
|
|
||||||
a,b,c,d ]*/
|
|
||||||
|
|
||||||
l1 = (vector unsigned short) vec_mergeh(b3, b2);
|
|
||||||
|
|
||||||
l2 = (vector unsigned short) vec_mergeh(b1, b0);
|
|
||||||
|
|
||||||
il = (vector unsigned char) vec_mergeh(l1,l2);
|
|
||||||
|
|
||||||
v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
|
|
||||||
//step1
|
|
||||||
|
|
||||||
h1 = (vector unsigned short) vec_mergeh(ob1, ob2);
|
|
||||||
|
|
||||||
h2 = (vector unsigned short) vec_mergeh(ob3, ob4);
|
|
||||||
|
|
||||||
ih = (vector unsigned char) vec_mergel(h1,h2);
|
|
||||||
|
|
||||||
l1 = (vector unsigned short) vec_mergeh(b3, b2);
|
|
||||||
|
|
||||||
l2 = (vector unsigned short) vec_mergeh(b1, b0);
|
|
||||||
|
|
||||||
il = (vector unsigned char) vec_mergel(l1,l2);
|
|
||||||
|
|
||||||
v[1] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
|
|
||||||
|
|
||||||
//step2
|
|
||||||
h1 = (vector unsigned short) vec_mergel(ob1, ob2);
|
|
||||||
|
|
||||||
h2 = (vector unsigned short) vec_mergel(ob3, ob4);
|
|
||||||
|
|
||||||
ih = (vector unsigned char) vec_mergeh(h1,h2);
|
|
||||||
|
|
||||||
l1 = (vector unsigned short) vec_mergel(b3, b2);
|
|
||||||
|
|
||||||
l2 = (vector unsigned short) vec_mergel(b1, b0);
|
|
||||||
|
|
||||||
il = (vector unsigned char) vec_mergeh(l1,l2);
|
|
||||||
|
|
||||||
v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
|
|
||||||
|
|
||||||
//step3
|
|
||||||
h1 = (vector unsigned short) vec_mergel(ob1, ob2);
|
|
||||||
|
|
||||||
h2 = (vector unsigned short) vec_mergel(ob3, ob4);
|
|
||||||
|
|
||||||
ih = (vector unsigned char) vec_mergel(h1,h2);
|
|
||||||
|
|
||||||
l1 = (vector unsigned short) vec_mergel(b3, b2);
|
|
||||||
|
|
||||||
l2 = (vector unsigned short) vec_mergel(b1, b0);
|
|
||||||
|
|
||||||
il = (vector unsigned char) vec_mergel(l1,l2);
|
|
||||||
|
|
||||||
v[3] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));
|
|
||||||
#if 1
|
|
||||||
for(x=0; x<b_w; x++)
|
|
||||||
if(add){
|
|
||||||
vbuf[x] += dst[x + src_x];
|
|
||||||
vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
|
|
||||||
if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
|
|
||||||
dst8[x + y*src_stride] = vbuf[x];
|
|
||||||
}else{
|
|
||||||
dst[x + src_x] -= vbuf[x];
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
if(add)
|
|
||||||
{
|
|
||||||
for(x=0; x<b_w/4; x++)
|
|
||||||
{
|
|
||||||
v[x] = vec_add(v[x], d[x]);
|
|
||||||
v[x] = vec_sra(vec_add(v[x],
|
|
||||||
vec_sl( vec_splat_s32(1),
|
|
||||||
vec_splat_u32(7))),
|
|
||||||
vec_splat_u32(8));
|
|
||||||
|
|
||||||
mask = vec_sl((vector signed int) vec_cmpeq(v[x],v[x]),vec_splat_u32(8));
|
|
||||||
mask = vec_and(v[x],vec_nor(mask,mask));
|
|
||||||
|
|
||||||
mask = (vector signed int) vec_cmpeq((vector signed int)mask,(vector signed int)vec_splat_u32(0));
|
|
||||||
|
|
||||||
vs = vec_sra(v[x],vec_splat_u32(8));
|
|
||||||
vs = vec_sra(v[x],vec_splat_u32(8));
|
|
||||||
vs = vec_sra(v[x],vec_splat_u32(15));
|
|
||||||
|
|
||||||
vs = vec_nor(vs,vs);
|
|
||||||
|
|
||||||
v[x]= vec_sel(v[x],vs,mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
for(x=0; x<b_w; x++)
|
|
||||||
dst8[x + y*src_stride] = vbuf[x];
|
|
||||||
|
|
||||||
}
|
|
||||||
else
|
|
||||||
for(x=0; x<b_w/4; x++)
|
|
||||||
d[x] = vec_sub(d[x], v[x]);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -804,17 +750,29 @@ void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
|
|||||||
slice_buffer * sb, int add,
|
slice_buffer * sb, int add,
|
||||||
uint8_t * dst8)
|
uint8_t * dst8)
|
||||||
{
|
{
|
||||||
//FIXME implement src_x&15 cases later
|
if (src_x&15) {
|
||||||
if (b_w == 16)
|
if (b_w == 16)
|
||||||
inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block, b_w,
|
inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
|
||||||
b_h, src_x, src_y, src_stride,
|
b_w, b_h, src_x, src_y,
|
||||||
sb, add, dst8);
|
src_stride, sb, add, dst8);
|
||||||
else if (b_w == 8)
|
else if (b_w == 8)
|
||||||
inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
|
inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
|
||||||
b_w, b_h, src_x, src_y,
|
b_w, b_h, src_x, src_y,
|
||||||
src_stride, sb, add, dst8);
|
src_stride, sb, add, dst8);
|
||||||
else
|
else
|
||||||
|
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
|
||||||
|
src_y, src_stride, sb, add, dst8);
|
||||||
|
} else {
|
||||||
|
if (b_w == 16)
|
||||||
|
inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
|
||||||
|
b_w, b_h, src_x, src_y,
|
||||||
|
src_stride, sb, add, dst8);
|
||||||
|
else if (b_w == 8)
|
||||||
|
inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
|
||||||
|
b_w, b_h, src_x, src_y,
|
||||||
|
src_stride, sb, add, dst8);
|
||||||
|
else
|
||||||
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
|
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
|
||||||
src_y, src_stride, sb, add, dst8);
|
src_y, src_stride, sb, add, dst8);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user