H264: change weight/biweight functions to take a height argument.
Neon parts by Mans Rullgard <mans@mansr.com>.
Parent: 229d263cc9
Commit: c2d337429c
@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
||||
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
||||
int beta, int8_t *tc0);
|
||||
|
||||
void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
|
||||
int weight, int offset);
|
||||
void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
|
||||
int weight, int offset);
|
||||
void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
|
||||
int weight, int offset);
|
||||
void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
|
||||
int weight, int offset);
|
||||
void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
|
||||
int weight, int offset);
|
||||
void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
|
||||
int weight, int offset);
|
||||
void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
|
||||
int weight, int offset);
|
||||
void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
|
||||
int weight, int offset);
|
||||
void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
|
||||
int log2_den, int weight, int offset);
|
||||
void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
|
||||
int log2_den, int weight, int offset);
|
||||
void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
|
||||
int log2_den, int weight, int offset);
|
||||
|
||||
void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int log2_den, int weightd, int weights,
|
||||
int offset);
|
||||
void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int log2_den, int weightd, int weights,
|
||||
int offset);
|
||||
void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int log2_den, int weightd, int weights,
|
||||
int offset);
|
||||
void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int log2_den, int weightd, int weights,
|
||||
int offset);
|
||||
void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int log2_den, int weightd, int weights,
|
||||
int offset);
|
||||
void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int log2_den, int weightd, int weights,
|
||||
int offset);
|
||||
void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int log2_den, int weightd, int weights,
|
||||
int offset);
|
||||
void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int log2_den, int weightd, int weights,
|
||||
int offset);
|
||||
void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int height, int log2_den, int weightd,
|
||||
int weights, int offset);
|
||||
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int height, int log2_den, int weightd,
|
||||
int weights, int offset);
|
||||
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||
int height, int log2_den, int weightd,
|
||||
int weights, int offset);
|
||||
|
||||
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
|
||||
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
|
||||
@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
|
||||
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
|
||||
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
|
||||
|
||||
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
|
||||
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
|
||||
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
|
||||
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
|
||||
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
|
||||
c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
|
||||
c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
|
||||
c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
|
||||
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
|
||||
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
|
||||
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
|
||||
|
||||
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
|
||||
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
|
||||
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
|
||||
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
|
||||
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
|
||||
c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
|
||||
c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
|
||||
c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
|
||||
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
|
||||
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
|
||||
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
|
||||
|
||||
c->h264_idct_add = ff_h264_idct_add_neon;
|
||||
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
|
||||
|
@ -1592,7 +1592,7 @@ endfunc
|
||||
vdup.8 d1, r5
|
||||
vmov q2, q8
|
||||
vmov q3, q8
|
||||
1: subs ip, ip, #2
|
||||
1: subs r3, r3, #2
|
||||
vld1.8 {d20-d21},[r0,:128], r2
|
||||
\macd q2, d0, d20
|
||||
pld [r0]
|
||||
@ -1632,7 +1632,7 @@ endfunc
|
||||
vdup.8 d1, r5
|
||||
vmov q1, q8
|
||||
vmov q10, q8
|
||||
1: subs ip, ip, #2
|
||||
1: subs r3, r3, #2
|
||||
vld1.8 {d4},[r0,:64], r2
|
||||
\macd q1, d0, d4
|
||||
pld [r0]
|
||||
@ -1662,7 +1662,7 @@ endfunc
|
||||
vdup.8 d1, r5
|
||||
vmov q1, q8
|
||||
vmov q10, q8
|
||||
1: subs ip, ip, #4
|
||||
1: subs r3, r3, #4
|
||||
vld1.32 {d4[0]},[r0,:32], r2
|
||||
vld1.32 {d4[1]},[r0,:32], r2
|
||||
\macd q1, d0, d4
|
||||
@ -1700,16 +1700,17 @@ endfunc
|
||||
.endm
|
||||
|
||||
.macro biweight_func w
|
||||
function biweight_h264_pixels_\w\()_neon
|
||||
function ff_biweight_h264_pixels_\w\()_neon, export=1
|
||||
push {r4-r6, lr}
|
||||
add r4, sp, #16
|
||||
ldr r12, [sp, #16]
|
||||
add r4, sp, #20
|
||||
ldm r4, {r4-r6}
|
||||
lsr lr, r4, #31
|
||||
add r6, r6, #1
|
||||
eors lr, lr, r5, lsr #30
|
||||
orr r6, r6, #1
|
||||
vdup.16 q9, r3
|
||||
lsl r6, r6, r3
|
||||
vdup.16 q9, r12
|
||||
lsl r6, r6, r12
|
||||
vmvn q9, q9
|
||||
vdup.16 q8, r6
|
||||
mov r6, r0
|
||||
@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ biweight_entry w, h, b=1
@ Emits the exported fixed-size entry point ff_biweight_h264_pixels_<w>x<h>_neon.
@ The entry loads the block height <h> into ip and, when b is non-zero (the
@ default), branches to the shared body biweight_h264_pixels_<w>_neon.
@ With b=0 no branch is emitted; the invocations that follow place such an
@ entry directly before the matching biweight_func, so presumably execution
@ falls through into that body (confirm against the function/endfunc macros).
.macro biweight_entry w, h, b=1
|
||||
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
|
||||
mov ip, #\h
|
||||
.if \b
|
||||
b biweight_h264_pixels_\w\()_neon
|
||||
.endif
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
biweight_entry 16, 8
|
||||
biweight_entry 16, 16, b=0
|
||||
biweight_func 16
|
||||
|
||||
biweight_entry 8, 16
|
||||
biweight_entry 8, 4
|
||||
biweight_entry 8, 8, b=0
|
||||
biweight_func 8
|
||||
|
||||
biweight_entry 4, 8
|
||||
biweight_entry 4, 2
|
||||
biweight_entry 4, 4, b=0
|
||||
biweight_func 4
|
||||
|
||||
@ Weighted prediction
|
||||
|
||||
.macro weight_16 add
|
||||
vdup.8 d0, r3
|
||||
1: subs ip, ip, #2
|
||||
vdup.8 d0, r12
|
||||
1: subs r2, r2, #2
|
||||
vld1.8 {d20-d21},[r0,:128], r1
|
||||
vmull.u8 q2, d0, d20
|
||||
pld [r0]
|
||||
@ -1785,8 +1767,8 @@ endfunc
|
||||
.endm
|
||||
|
||||
.macro weight_8 add
|
||||
vdup.8 d0, r3
|
||||
1: subs ip, ip, #2
|
||||
vdup.8 d0, r12
|
||||
1: subs r2, r2, #2
|
||||
vld1.8 {d4},[r0,:64], r1
|
||||
vmull.u8 q1, d0, d4
|
||||
pld [r0]
|
||||
@ -1806,10 +1788,10 @@ endfunc
|
||||
.endm
|
||||
|
||||
.macro weight_4 add
|
||||
vdup.8 d0, r3
|
||||
vdup.8 d0, r12
|
||||
vmov q1, q8
|
||||
vmov q10, q8
|
||||
1: subs ip, ip, #4
|
||||
1: subs r2, r2, #4
|
||||
vld1.32 {d4[0]},[r0,:32], r1
|
||||
vld1.32 {d4[1]},[r0,:32], r1
|
||||
vmull.u8 q1, d0, d4
|
||||
@ -1842,50 +1824,32 @@ endfunc
|
||||
.endm
|
||||
|
||||
@ weight_func w
@ Emits the NEON weighted-prediction routine for blocks that are <w> pixels
@ wide. The lines below interleave the pre-change and post-change versions of
@ the routine, so consecutive instructions may be alternatives of each other:
@   pre-change:  local symbol weight_h264_pixels_<w>_neon; log2_den is in r2,
@                weight is in r3, and the value loaded from [sp, #8] (after
@                the two-register push) is shifted left by log2_den.
@   post-change: exported symbol ff_weight_h264_pixels_<w>_neon; log2_den is
@                in r3, the value from [sp, #8] goes to r12 and is sign-tested
@                as the weight, the value from [sp, #12] goes to r4 and is
@                shifted left by log2_den. r2 is the row counter decremented
@                by the weight_<w> loops.
@ Common flow: q8 = (shifted value) replicated to 16-bit lanes, r4 = copy of
@ r0. If log2_den <= 1 the code at 20: runs with q9 = -log2_den and the plain
@ vadd/vsub forms; otherwise q9 = 1 - log2_den and the halving vhadd/vhsub
@ forms are used. In either path a negative weight is negated and the
@ subtracting variant of weight_<w> is selected.
@ NOTE(review): the return sequence is not visible here; presumably each
@ weight_<w> expansion ends by popping {r4, pc} - confirm in those macros.
.macro weight_func w
|
||||
function weight_h264_pixels_\w\()_neon
|
||||
function ff_weight_h264_pixels_\w\()_neon, export=1
|
||||
push {r4, lr}
|
||||
ldr r4, [sp, #8]
|
||||
cmp r2, #1
|
||||
lsl r4, r4, r2
|
||||
ldr r12, [sp, #8]
|
||||
ldr r4, [sp, #12]
|
||||
cmp r3, #1
|
||||
lsl r4, r4, r3
|
||||
vdup.16 q8, r4
|
||||
mov r4, r0
|
||||
ble 20f
|
||||
rsb lr, r2, #1
|
||||
rsb lr, r3, #1
|
||||
vdup.16 q9, lr
|
||||
cmp r3, #0
|
||||
cmp r12, #0
|
||||
blt 10f
|
||||
weight_\w vhadd.s16
|
||||
10: rsb r3, r3, #0
|
||||
10: rsb r12, r12, #0
|
||||
weight_\w vhsub.s16
|
||||
20: rsb lr, r2, #0
|
||||
20: rsb lr, r3, #0
|
||||
vdup.16 q9, lr
|
||||
cmp r3, #0
|
||||
cmp r12, #0
|
||||
blt 10f
|
||||
weight_\w vadd.s16
|
||||
10: rsb r3, r3, #0
|
||||
10: rsb r12, r12, #0
|
||||
weight_\w vsub.s16
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
@ weight_entry w, h, b=1
@ Emits the exported fixed-size entry point ff_weight_h264_pixels_<w>x<h>_neon.
@ The entry loads the block height <h> into ip and, when b is non-zero (the
@ default), branches to the shared body weight_h264_pixels_<w>_neon.
@ With b=0 no branch is emitted; the invocations that follow place such an
@ entry directly before the matching weight_func, so presumably execution
@ falls through into that body (confirm against the function/endfunc macros).
.macro weight_entry w, h, b=1
|
||||
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
|
||||
mov ip, #\h
|
||||
.if \b
|
||||
b weight_h264_pixels_\w\()_neon
|
||||
.endif
|
||||
endfunc
|
||||
.endm
|
||||
|
||||
weight_entry 16, 8
|
||||
weight_entry 16, 16, b=0
|
||||
weight_func 16
|
||||
|
||||
weight_entry 8, 16
|
||||
weight_entry 8, 4
|
||||
weight_entry 8, 8, b=0
|
||||
weight_func 8
|
||||
|
||||
weight_entry 4, 8
|
||||
weight_entry 4, 2
|
||||
weight_entry 4, 4, b=0
|
||||
weight_func 4
|
||||
|
@ -438,7 +438,8 @@ static void chroma_dc_dct_c(DCTELEM *block){
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
|
||||
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
|
||||
int height, int delta, int list,
|
||||
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
|
||||
int src_x_offset, int src_y_offset,
|
||||
qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op,
|
||||
@ -518,16 +519,16 @@ static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square,
|
||||
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
|
||||
src_cb= s->edge_emu_buffer;
|
||||
}
|
||||
chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
|
||||
chroma_op(dest_cb, src_cb, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
|
||||
|
||||
if(emu){
|
||||
s->dsp.emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, (16>>!(CHROMA422)) + 1, (mx>>3), (my>>ysh), pic_width>>1, pic_height>>!(CHROMA422));
|
||||
src_cr= s->edge_emu_buffer;
|
||||
}
|
||||
chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height << !!(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
|
||||
chroma_op(dest_cr, src_cr, h->mb_uvlinesize, height >> !(CHROMA422), mx&7, (my << !!(CHROMA422)) &7);
|
||||
}
|
||||
|
||||
static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
|
||||
static inline void mc_part_std(H264Context *h, int n, int square, int height, int delta,
|
||||
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
|
||||
int x_offset, int y_offset,
|
||||
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
|
||||
@ -553,7 +554,7 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
|
||||
|
||||
if(list0){
|
||||
Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
|
||||
mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
|
||||
mc_dir_part(h, ref, n, square, height, delta, 0,
|
||||
dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
qpix_op, chroma_op, pixel_shift, chroma444);
|
||||
|
||||
@ -563,13 +564,13 @@ static inline void mc_part_std(H264Context *h, int n, int square, int chroma_hei
|
||||
|
||||
if(list1){
|
||||
Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
|
||||
mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
|
||||
mc_dir_part(h, ref, n, square, height, delta, 1,
|
||||
dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
qpix_op, chroma_op, pixel_shift, chroma444);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
|
||||
static inline void mc_part_weighted(H264Context *h, int n, int square, int height, int delta,
|
||||
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
|
||||
int x_offset, int y_offset,
|
||||
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
|
||||
@ -577,17 +578,21 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
|
||||
h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
|
||||
int list0, int list1, int pixel_shift, int chroma444){
|
||||
MpegEncContext * const s = &h->s;
|
||||
int chroma_height;
|
||||
|
||||
dest_y += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
|
||||
if(chroma444){
|
||||
chroma_height = height;
|
||||
chroma_weight_avg = luma_weight_avg;
|
||||
chroma_weight_op = luma_weight_op;
|
||||
dest_cb += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
|
||||
dest_cr += (2*x_offset << pixel_shift) + 2*y_offset*h->mb_linesize;
|
||||
} else if (CHROMA422) {
|
||||
chroma_height = height;
|
||||
dest_cb += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
|
||||
dest_cr += ( x_offset << pixel_shift) + 2*y_offset*h->mb_uvlinesize;
|
||||
}else{
|
||||
chroma_height = height >> 1;
|
||||
dest_cb += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
|
||||
dest_cr += ( x_offset << pixel_shift) + y_offset*h->mb_uvlinesize;
|
||||
}
|
||||
@ -603,78 +608,53 @@ static inline void mc_part_weighted(H264Context *h, int n, int square, int chrom
|
||||
int refn0 = h->ref_cache[0][ scan8[n] ];
|
||||
int refn1 = h->ref_cache[1][ scan8[n] ];
|
||||
|
||||
mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
|
||||
mc_dir_part(h, &h->ref_list[0][refn0], n, square, height, delta, 0,
|
||||
dest_y, dest_cb, dest_cr,
|
||||
x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
|
||||
mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
|
||||
mc_dir_part(h, &h->ref_list[1][refn1], n, square, height, delta, 1,
|
||||
tmp_y, tmp_cb, tmp_cr,
|
||||
x_offset, y_offset, qpix_put, chroma_put, pixel_shift, chroma444);
|
||||
|
||||
if(h->use_weight == 2){
|
||||
int weight0 = h->implicit_weight[refn0][refn1][s->mb_y&1];
|
||||
int weight1 = 64 - weight0;
|
||||
luma_weight_avg( dest_y, tmp_y, h-> mb_linesize, 5, weight0, weight1, 0);
|
||||
chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
|
||||
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
|
||||
if (CHROMA422) {
|
||||
chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
|
||||
tmp_cb + chroma_height * h->mb_uvlinesize,
|
||||
h->mb_uvlinesize, 5, weight0, weight1, 0);
|
||||
chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
|
||||
tmp_cr + chroma_height * h->mb_uvlinesize,
|
||||
h->mb_uvlinesize, 5, weight0, weight1, 0);
|
||||
}
|
||||
luma_weight_avg( dest_y, tmp_y, h-> mb_linesize,
|
||||
height, 5, weight0, weight1, 0);
|
||||
chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize,
|
||||
chroma_height, 5, weight0, weight1, 0);
|
||||
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize,
|
||||
chroma_height, 5, weight0, weight1, 0);
|
||||
}else{
|
||||
luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
|
||||
luma_weight_avg(dest_y, tmp_y, h->mb_linesize, height, h->luma_log2_weight_denom,
|
||||
h->luma_weight[refn0][0][0] , h->luma_weight[refn1][1][0],
|
||||
h->luma_weight[refn0][0][1] + h->luma_weight[refn1][1][1]);
|
||||
chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
|
||||
chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
|
||||
h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
|
||||
h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
|
||||
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
|
||||
chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
|
||||
h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
|
||||
h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
|
||||
if (CHROMA422) {
|
||||
chroma_weight_avg(dest_cb + chroma_height * h->mb_uvlinesize,
|
||||
tmp_cb + chroma_height * h->mb_uvlinesize,
|
||||
h->mb_uvlinesize, h->chroma_log2_weight_denom,
|
||||
h->chroma_weight[refn0][0][0][0] , h->chroma_weight[refn1][1][0][0],
|
||||
h->chroma_weight[refn0][0][0][1] + h->chroma_weight[refn1][1][0][1]);
|
||||
chroma_weight_avg(dest_cr + chroma_height * h->mb_uvlinesize,
|
||||
tmp_cr + chroma_height * h->mb_uvlinesize,
|
||||
h->mb_uvlinesize, h->chroma_log2_weight_denom,
|
||||
h->chroma_weight[refn0][0][1][0] , h->chroma_weight[refn1][1][1][0],
|
||||
h->chroma_weight[refn0][0][1][1] + h->chroma_weight[refn1][1][1][1]);
|
||||
}
|
||||
}
|
||||
}else{
|
||||
int list = list1 ? 1 : 0;
|
||||
int refn = h->ref_cache[list][ scan8[n] ];
|
||||
Picture *ref= &h->ref_list[list][refn];
|
||||
mc_dir_part(h, ref, n, square, chroma_height, delta, list,
|
||||
mc_dir_part(h, ref, n, square, height, delta, list,
|
||||
dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
qpix_put, chroma_put, pixel_shift, chroma444);
|
||||
|
||||
luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
|
||||
luma_weight_op(dest_y, h->mb_linesize, height, h->luma_log2_weight_denom,
|
||||
h->luma_weight[refn][list][0], h->luma_weight[refn][list][1]);
|
||||
if(h->use_weight_chroma){
|
||||
chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
|
||||
chroma_weight_op(dest_cb, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
|
||||
h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
|
||||
chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
|
||||
chroma_weight_op(dest_cr, h->mb_uvlinesize, chroma_height, h->chroma_log2_weight_denom,
|
||||
h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
|
||||
if (CHROMA422) {
|
||||
chroma_weight_op(dest_cb + chroma_height * h->mb_uvlinesize,
|
||||
h->mb_uvlinesize, h->chroma_log2_weight_denom,
|
||||
h->chroma_weight[refn][list][0][0], h->chroma_weight[refn][list][0][1]);
|
||||
chroma_weight_op(dest_cr + chroma_height * h->mb_uvlinesize,
|
||||
h->mb_uvlinesize, h->chroma_log2_weight_denom,
|
||||
h->chroma_weight[refn][list][1][0], h->chroma_weight[refn][list][1][1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
|
||||
static inline void mc_part(H264Context *h, int n, int square, int height, int delta,
|
||||
uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
|
||||
int x_offset, int y_offset,
|
||||
qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
|
||||
@ -684,12 +664,12 @@ static inline void mc_part(H264Context *h, int n, int square, int chroma_height,
|
||||
if((h->use_weight==2 && list0 && list1
|
||||
&& (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ][h->s.mb_y&1] != 32))
|
||||
|| h->use_weight==1)
|
||||
mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
|
||||
mc_part_weighted(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
|
||||
x_offset, y_offset, qpix_put, chroma_put,
|
||||
weight_op[0], weight_op[3], weight_avg[0],
|
||||
weight_avg[3], list0, list1, pixel_shift, chroma444);
|
||||
weight_op[0], weight_op[1], weight_avg[0],
|
||||
weight_avg[1], list0, list1, pixel_shift, chroma444);
|
||||
else
|
||||
mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
|
||||
mc_part_std(h, n, square, height, delta, dest_y, dest_cb, dest_cr,
|
||||
x_offset, y_offset, qpix_put, chroma_put, qpix_avg,
|
||||
chroma_avg, list0, list1, pixel_shift, chroma444);
|
||||
}
|
||||
@ -731,31 +711,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
|
||||
prefetch_motion(h, 0, pixel_shift, chroma444);
|
||||
|
||||
if(IS_16X16(mb_type)){
|
||||
mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
|
||||
mc_part(h, 0, 1, 16, 0, dest_y, dest_cb, dest_cr, 0, 0,
|
||||
qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
|
||||
weight_op, weight_avg,
|
||||
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
}else if(IS_16X8(mb_type)){
|
||||
mc_part(h, 0, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
|
||||
mc_part(h, 0, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 0,
|
||||
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
|
||||
&weight_op[1], &weight_avg[1],
|
||||
weight_op, weight_avg,
|
||||
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
mc_part(h, 8, 0, 4, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
|
||||
mc_part(h, 8, 0, 8, 8 << pixel_shift, dest_y, dest_cb, dest_cr, 0, 4,
|
||||
qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
|
||||
&weight_op[1], &weight_avg[1],
|
||||
weight_op, weight_avg,
|
||||
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
|
||||
pixel_shift, chroma444);
|
||||
}else if(IS_8X16(mb_type)){
|
||||
mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
|
||||
mc_part(h, 0, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
|
||||
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
|
||||
&weight_op[2], &weight_avg[2],
|
||||
&weight_op[1], &weight_avg[1],
|
||||
IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
|
||||
mc_part(h, 4, 0, 16, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
|
||||
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
|
||||
&weight_op[2], &weight_avg[2],
|
||||
&weight_op[1], &weight_avg[1],
|
||||
IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1),
|
||||
pixel_shift, chroma444);
|
||||
}else{
|
||||
@ -770,31 +750,31 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
|
||||
int y_offset= (i&2)<<1;
|
||||
|
||||
if(IS_SUB_8X8(sub_mb_type)){
|
||||
mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
mc_part(h, n, 1, 8, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
|
||||
&weight_op[3], &weight_avg[3],
|
||||
&weight_op[1], &weight_avg[1],
|
||||
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
}else if(IS_SUB_8X4(sub_mb_type)){
|
||||
mc_part(h, n , 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
mc_part(h, n , 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
|
||||
&weight_op[4], &weight_avg[4],
|
||||
&weight_op[1], &weight_avg[1],
|
||||
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
mc_part(h, n+2, 0, 2, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
|
||||
mc_part(h, n+2, 0, 4, 4 << pixel_shift, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
|
||||
qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
|
||||
&weight_op[4], &weight_avg[4],
|
||||
&weight_op[1], &weight_avg[1],
|
||||
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
}else if(IS_SUB_4X8(sub_mb_type)){
|
||||
mc_part(h, n , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
mc_part(h, n , 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
|
||||
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
|
||||
&weight_op[5], &weight_avg[5],
|
||||
&weight_op[2], &weight_avg[2],
|
||||
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
|
||||
mc_part(h, n+1, 0, 8, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
|
||||
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
|
||||
&weight_op[5], &weight_avg[5],
|
||||
&weight_op[2], &weight_avg[2],
|
||||
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
}else{
|
||||
@ -803,9 +783,9 @@ static av_always_inline void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t
|
||||
for(j=0; j<4; j++){
|
||||
int sub_x_offset= x_offset + 2*(j&1);
|
||||
int sub_y_offset= y_offset + (j&2);
|
||||
mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
|
||||
mc_part(h, n+j, 1, 4, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
|
||||
qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
|
||||
&weight_op[6], &weight_avg[6],
|
||||
&weight_op[2], &weight_avg[2],
|
||||
IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1),
|
||||
pixel_shift, chroma444);
|
||||
}
|
||||
|
@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
|
||||
else\
|
||||
c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
|
||||
\
|
||||
c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
|
||||
c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
|
||||
c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
|
||||
c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
|
||||
c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
|
||||
c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
|
||||
c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
|
||||
c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
|
||||
c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
|
||||
c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
|
||||
c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
|
||||
c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
|
||||
c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
|
||||
c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
|
||||
c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
|
||||
c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
|
||||
c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
|
||||
c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
|
||||
c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
|
||||
c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
|
||||
c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
|
||||
c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
|
||||
c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
|
||||
c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
|
||||
c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
|
||||
c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
|
||||
c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
|
||||
c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
|
||||
\
|
||||
c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
|
||||
c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
|
||||
|
@ -31,16 +31,18 @@
|
||||
#include "dsputil.h"
|
||||
|
||||
//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
|
||||
/* Pre-change signatures: no height parameter; the DSP init code fills one
 * table slot per WxH block size, so the row count is fixed per function. */
typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
|
||||
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
|
||||
/**
 * Post-change signatures: the number of rows to process is passed explicitly
 * as `height`, directly after `stride`, so one function per block width is
 * registered.
 *
 * In the C template `stride` is a byte count (it is divided by sizeof(pixel))
 * and, after the template has rescaled `offset`, each pixel is computed as
 *   weight:   clip((block[x] * weight + offset) >> log2_denom)
 *   biweight: clip((src[x] * weights + dst[x] * weightd + offset) >> (log2_denom + 1))
 */
typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
|
||||
int log2_denom, int weight, int offset);
|
||||
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
|
||||
int log2_denom, int weightd, int weights, int offset);
|
||||
|
||||
/**
|
||||
* Context for storing H.264 DSP functions
|
||||
*/
|
||||
typedef struct H264DSPContext{
|
||||
/* weighted MC */
|
||||
h264_weight_func weight_h264_pixels_tab[10];
|
||||
h264_biweight_func biweight_h264_pixels_tab[10];
|
||||
h264_weight_func weight_h264_pixels_tab[4];
|
||||
h264_biweight_func biweight_h264_pixels_tab[4];
|
||||
|
||||
/* loop filter */
|
||||
void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
|
||||
|
@ -29,14 +29,16 @@
|
||||
|
||||
#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
|
||||
#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
|
||||
#define H264_WEIGHT(W,H) \
|
||||
static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
|
||||
#define H264_WEIGHT(W) \
|
||||
static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
|
||||
int log2_denom, int weight, int offset) \
|
||||
{ \
|
||||
int y; \
|
||||
pixel *block = (pixel*)_block; \
|
||||
stride /= sizeof(pixel); \
|
||||
offset <<= (log2_denom + (BIT_DEPTH-8)); \
|
||||
if(log2_denom) offset += 1<<(log2_denom-1); \
|
||||
for(y=0; y<H; y++, block += stride){ \
|
||||
for (y = 0; y < height; y++, block += stride) { \
|
||||
op_scale1(0); \
|
||||
op_scale1(1); \
|
||||
if(W==2) continue; \
|
||||
@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
|
||||
op_scale1(15); \
|
||||
} \
|
||||
} \
|
||||
static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
|
||||
static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
|
||||
int log2_denom, int weightd, int weights, int offset) \
|
||||
{ \
|
||||
int y; \
|
||||
pixel *dst = (pixel*)_dst; \
|
||||
pixel *src = (pixel*)_src; \
|
||||
stride /= sizeof(pixel); \
|
||||
offset <<= (BIT_DEPTH-8); \
|
||||
offset = ((offset + 1) | 1) << log2_denom; \
|
||||
for(y=0; y<H; y++, dst += stride, src += stride){ \
|
||||
for (y = 0; y < height; y++, dst += stride, src += stride) { \
|
||||
op_scale2(0); \
|
||||
op_scale2(1); \
|
||||
if(W==2) continue; \
|
||||
@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
|
||||
} \
|
||||
}
|
||||
|
||||
H264_WEIGHT(16,16)
|
||||
H264_WEIGHT(16,8)
|
||||
H264_WEIGHT(8,16)
|
||||
H264_WEIGHT(8,8)
|
||||
H264_WEIGHT(8,4)
|
||||
H264_WEIGHT(4,8)
|
||||
H264_WEIGHT(4,4)
|
||||
H264_WEIGHT(4,2)
|
||||
H264_WEIGHT(2,4)
|
||||
H264_WEIGHT(2,2)
|
||||
H264_WEIGHT(16)
|
||||
H264_WEIGHT(8)
|
||||
H264_WEIGHT(4)
|
||||
H264_WEIGHT(2)
|
||||
|
||||
#undef op_scale1
|
||||
#undef op_scale2
|
||||
|
@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
|
||||
}
|
||||
|
||||
static av_always_inline
|
||||
void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
|
||||
void weight_h264_W_altivec(uint8_t *block, int stride, int height,
|
||||
int log2_denom, int weight, int offset, int w)
|
||||
{
|
||||
int y, aligned;
|
||||
vec_u8 vblock;
|
||||
@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
|
||||
voffset = vec_splat(vtemp, 5);
|
||||
aligned = !((unsigned long)block & 0xf);
|
||||
|
||||
for (y=0; y<h; y++) {
|
||||
for (y = 0; y < height; y++) {
|
||||
vblock = vec_ld(0, block);
|
||||
|
||||
v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
|
||||
@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
|
||||
}
|
||||
|
||||
static av_always_inline
|
||||
void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
|
||||
int weightd, int weights, int offset, int w, int h)
|
||||
void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
|
||||
int log2_denom, int weightd, int weights, int offset, int w)
|
||||
{
|
||||
int y, dst_aligned, src_aligned;
|
||||
vec_u8 vsrc, vdst;
|
||||
@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
|
||||
dst_aligned = !((unsigned long)dst & 0xf);
|
||||
src_aligned = !((unsigned long)src & 0xf);
|
||||
|
||||
for (y=0; y<h; y++) {
|
||||
for (y = 0; y < height; y++) {
|
||||
vdst = vec_ld(0, dst);
|
||||
vsrc = vec_ld(0, src);
|
||||
|
||||
@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
|
||||
}
|
||||
}
|
||||
|
||||
#define H264_WEIGHT(W,H) \
|
||||
static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
|
||||
weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
|
||||
#define H264_WEIGHT(W) \
|
||||
static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
|
||||
int log2_denom, int weight, int offset){ \
|
||||
weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
|
||||
}\
|
||||
static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
|
||||
biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
|
||||
static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
|
||||
int log2_denom, int weightd, int weights, int offset){ \
|
||||
biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
|
||||
}
|
||||
|
||||
H264_WEIGHT(16,16)
|
||||
H264_WEIGHT(16, 8)
|
||||
H264_WEIGHT( 8,16)
|
||||
H264_WEIGHT( 8, 8)
|
||||
H264_WEIGHT( 8, 4)
|
||||
H264_WEIGHT(16)
|
||||
H264_WEIGHT( 8)
|
||||
|
||||
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
|
||||
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
|
||||
@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
|
||||
c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
|
||||
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
|
||||
|
||||
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
|
||||
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
|
||||
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
|
||||
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
|
||||
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
|
||||
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
|
||||
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
|
||||
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
|
||||
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
|
||||
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
|
||||
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
|
||||
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
|
||||
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
|
||||
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -28,21 +28,20 @@ SECTION .text
|
||||
;-----------------------------------------------------------------------------
|
||||
; biweight pred:
|
||||
;
|
||||
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
|
||||
; int log2_denom, int weightd, int weights,
|
||||
; int offset);
|
||||
; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
|
||||
; int height, int log2_denom, int weightd,
|
||||
; int weights, int offset);
|
||||
; and
|
||||
; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
|
||||
; int log2_denom, int weight,
|
||||
; int offset);
|
||||
; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
|
||||
; int log2_denom, int weight, int offset);
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%macro WEIGHT_SETUP 0
|
||||
add r4, r4
|
||||
inc r4
|
||||
movd m3, r3d
|
||||
movd m5, r4d
|
||||
movd m6, r2d
|
||||
add r5, r5
|
||||
inc r5
|
||||
movd m3, r4d
|
||||
movd m5, r5d
|
||||
movd m6, r3d
|
||||
pslld m5, m6
|
||||
psrld m5, 1
|
||||
%if mmsize == 16
|
||||
@ -71,60 +70,41 @@ SECTION .text
|
||||
packuswb m0, m1
|
||||
%endmacro
|
||||
|
||||
%macro WEIGHT_FUNC_DBL_MM 1
|
||||
cglobal h264_weight_16x%1_mmx2, 5, 5, 0
|
||||
INIT_MMX
|
||||
cglobal h264_weight_16_mmx2, 6, 6, 0
|
||||
WEIGHT_SETUP
|
||||
mov r2, %1
|
||||
%if %1 == 16
|
||||
.nextrow
|
||||
WEIGHT_OP 0, 4
|
||||
mova [r0 ], m0
|
||||
WEIGHT_OP 8, 12
|
||||
mova [r0+8], m0
|
||||
add r0, r1
|
||||
dec r2
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%else
|
||||
jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_MMX
|
||||
WEIGHT_FUNC_DBL_MM 16
|
||||
WEIGHT_FUNC_DBL_MM 8
|
||||
|
||||
%macro WEIGHT_FUNC_MM 4
|
||||
cglobal h264_weight_%1x%2_%4, 7, 7, %3
|
||||
%macro WEIGHT_FUNC_MM 3
|
||||
cglobal h264_weight_%1_%3, 6, 6, %2
|
||||
WEIGHT_SETUP
|
||||
mov r2, %2
|
||||
%if %2 == 16
|
||||
.nextrow
|
||||
WEIGHT_OP 0, mmsize/2
|
||||
mova [r0], m0
|
||||
add r0, r1
|
||||
dec r2
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%else
|
||||
jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_MMX
|
||||
WEIGHT_FUNC_MM 8, 16, 0, mmx2
|
||||
WEIGHT_FUNC_MM 8, 8, 0, mmx2
|
||||
WEIGHT_FUNC_MM 8, 4, 0, mmx2
|
||||
WEIGHT_FUNC_MM 8, 0, mmx2
|
||||
INIT_XMM
|
||||
WEIGHT_FUNC_MM 16, 16, 8, sse2
|
||||
WEIGHT_FUNC_MM 16, 8, 8, sse2
|
||||
WEIGHT_FUNC_MM 16, 8, sse2
|
||||
|
||||
%macro WEIGHT_FUNC_HALF_MM 5
|
||||
cglobal h264_weight_%1x%2_%5, 5, 5, %4
|
||||
%macro WEIGHT_FUNC_HALF_MM 3
|
||||
cglobal h264_weight_%1_%3, 6, 6, %2
|
||||
WEIGHT_SETUP
|
||||
mov r2, %2/2
|
||||
sar r2d, 1
|
||||
lea r3, [r1*2]
|
||||
%if %2 == mmsize
|
||||
.nextrow
|
||||
WEIGHT_OP 0, r1
|
||||
movh [r0], m0
|
||||
@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
|
||||
movh [r0+r1], m0
|
||||
%endif
|
||||
add r0, r3
|
||||
dec r2
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%else
|
||||
jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_MMX
|
||||
WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
|
||||
WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
|
||||
WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
|
||||
WEIGHT_FUNC_HALF_MM 4, 0, mmx2
|
||||
WEIGHT_FUNC_HALF_MM 4, 0, mmx2
|
||||
WEIGHT_FUNC_HALF_MM 4, 0, mmx2
|
||||
INIT_XMM
|
||||
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
|
||||
WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
|
||||
WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
|
||||
WEIGHT_FUNC_HALF_MM 8, 8, sse2
|
||||
WEIGHT_FUNC_HALF_MM 8, 8, sse2
|
||||
WEIGHT_FUNC_HALF_MM 8, 8, sse2
|
||||
|
||||
%macro BIWEIGHT_SETUP 0
|
||||
add r6, 1
|
||||
or r6, 1
|
||||
add r3, 1
|
||||
movd m3, r4d
|
||||
movd m4, r5d
|
||||
movd m5, r6d
|
||||
movd m6, r3d
|
||||
%ifdef ARCH_X86_64
|
||||
%define off_regd r11d
|
||||
%else
|
||||
%define off_regd r3d
|
||||
%endif
|
||||
mov off_regd, r7m
|
||||
add off_regd, 1
|
||||
or off_regd, 1
|
||||
add r4, 1
|
||||
movd m3, r5d
|
||||
movd m4, r6d
|
||||
movd m5, off_regd
|
||||
movd m6, r4d
|
||||
pslld m5, m6
|
||||
psrld m5, 1
|
||||
%if mmsize == 16
|
||||
@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
|
||||
packuswb m0, m1
|
||||
%endmacro
|
||||
|
||||
%macro BIWEIGHT_FUNC_DBL_MM 1
|
||||
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
|
||||
INIT_MMX
|
||||
cglobal h264_biweight_16_mmx2, 7, 7, 0
|
||||
BIWEIGHT_SETUP
|
||||
mov r3, %1
|
||||
%if %1 == 16
|
||||
movifnidn r3d, r3m
|
||||
.nextrow
|
||||
BIWEIGHT_STEPA 0, 1, 0
|
||||
BIWEIGHT_STEPA 1, 2, 4
|
||||
@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
|
||||
mova [r0+8], m0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%else
|
||||
jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_MMX
|
||||
BIWEIGHT_FUNC_DBL_MM 16
|
||||
BIWEIGHT_FUNC_DBL_MM 8
|
||||
|
||||
%macro BIWEIGHT_FUNC_MM 4
|
||||
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
|
||||
%macro BIWEIGHT_FUNC_MM 3
|
||||
cglobal h264_biweight_%1_%3, 7, 7, %2
|
||||
BIWEIGHT_SETUP
|
||||
mov r3, %2
|
||||
%if %2 == 16
|
||||
movifnidn r3d, r3m
|
||||
.nextrow
|
||||
BIWEIGHT_STEPA 0, 1, 0
|
||||
BIWEIGHT_STEPA 1, 2, mmsize/2
|
||||
@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
|
||||
mova [r0], m0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%else
|
||||
jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_MMX
|
||||
BIWEIGHT_FUNC_MM 8, 16, 0, mmx2
|
||||
BIWEIGHT_FUNC_MM 8, 8, 0, mmx2
|
||||
BIWEIGHT_FUNC_MM 8, 4, 0, mmx2
|
||||
BIWEIGHT_FUNC_MM 8, 0, mmx2
|
||||
INIT_XMM
|
||||
BIWEIGHT_FUNC_MM 16, 16, 8, sse2
|
||||
BIWEIGHT_FUNC_MM 16, 8, 8, sse2
|
||||
BIWEIGHT_FUNC_MM 16, 8, sse2
|
||||
|
||||
%macro BIWEIGHT_FUNC_HALF_MM 5
|
||||
cglobal h264_biweight_%1x%2_%5, 7, 7, %4
|
||||
%macro BIWEIGHT_FUNC_HALF_MM 3
|
||||
cglobal h264_biweight_%1_%3, 7, 7, %2
|
||||
BIWEIGHT_SETUP
|
||||
mov r3, %2/2
|
||||
movifnidn r3d, r3m
|
||||
sar r3d, 1
|
||||
lea r4, [r2*2]
|
||||
%if %2 == mmsize
|
||||
.nextrow
|
||||
BIWEIGHT_STEPA 0, 1, 0
|
||||
BIWEIGHT_STEPA 1, 2, r2
|
||||
@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
|
||||
%endif
|
||||
add r0, r4
|
||||
add r1, r4
|
||||
dec r3
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%else
|
||||
jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_MMX
|
||||
BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
|
||||
BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
|
||||
BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
|
||||
BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
|
||||
INIT_XMM
|
||||
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
|
||||
BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
|
||||
BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
|
||||
BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
|
||||
|
||||
%macro BIWEIGHT_SSSE3_SETUP 0
|
||||
add r6, 1
|
||||
or r6, 1
|
||||
add r3, 1
|
||||
movd m4, r4d
|
||||
movd m0, r5d
|
||||
movd m5, r6d
|
||||
movd m6, r3d
|
||||
%ifdef ARCH_X86_64
|
||||
%define off_regd r11d
|
||||
%else
|
||||
%define off_regd r3d
|
||||
%endif
|
||||
mov off_regd, r7m
|
||||
add off_regd, 1
|
||||
or off_regd, 1
|
||||
add r4, 1
|
||||
movd m4, r5d
|
||||
movd m0, r6d
|
||||
movd m5, off_regd
|
||||
movd m6, r4d
|
||||
pslld m5, m6
|
||||
psrld m5, 1
|
||||
punpcklbw m4, m0
|
||||
@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
|
||||
packuswb m0, m2
|
||||
%endmacro
|
||||
|
||||
%macro BIWEIGHT_SSSE3_16 1
|
||||
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
|
||||
INIT_XMM
|
||||
cglobal h264_biweight_16_ssse3, 7, 7, 8
|
||||
BIWEIGHT_SSSE3_SETUP
|
||||
mov r3, %1
|
||||
movifnidn r3d, r3m
|
||||
|
||||
%if %1 == 16
|
||||
.nextrow
|
||||
movh m0, [r0]
|
||||
movh m2, [r0+8]
|
||||
@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
|
||||
mova [r0], m0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%else
|
||||
jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
BIWEIGHT_SSSE3_16 16
|
||||
BIWEIGHT_SSSE3_16 8
|
||||
|
||||
%macro BIWEIGHT_SSSE3_8 1
|
||||
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
|
||||
cglobal h264_biweight_8_ssse3, 7, 7, 8
|
||||
BIWEIGHT_SSSE3_SETUP
|
||||
mov r3, %1/2
|
||||
movifnidn r3d, r3m
|
||||
sar r3d, 1
|
||||
lea r4, [r2*2]
|
||||
|
||||
%if %1 == 16
|
||||
.nextrow
|
||||
movh m0, [r0]
|
||||
movh m1, [r1]
|
||||
@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
|
||||
movhps [r0+r2], m0
|
||||
add r0, r4
|
||||
add r1, r4
|
||||
dec r3
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
%else
|
||||
jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
|
||||
%endif
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
BIWEIGHT_SSSE3_8 16
|
||||
BIWEIGHT_SSSE3_8 8
|
||||
BIWEIGHT_SSSE3_8 4
|
||||
|
@ -36,33 +36,26 @@ cextern pw_1
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void h264_weight(uint8_t *dst, int stride, int log2_denom,
|
||||
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
|
||||
; int weight, int offset);
|
||||
;-----------------------------------------------------------------------------
|
||||
%ifdef ARCH_X86_32
|
||||
DECLARE_REG_TMP 2
|
||||
%else
|
||||
DECLARE_REG_TMP 10
|
||||
%endif
|
||||
|
||||
%macro WEIGHT_PROLOGUE 1
|
||||
mov t0, %1
|
||||
%macro WEIGHT_PROLOGUE 0
|
||||
.prologue
|
||||
PROLOGUE 0,5,8
|
||||
PROLOGUE 0,6,8
|
||||
movifnidn r0, r0mp
|
||||
movifnidn r1d, r1m
|
||||
movifnidn r3d, r3m
|
||||
movifnidn r4d, r4m
|
||||
movifnidn r5d, r5m
|
||||
%endmacro
|
||||
|
||||
%macro WEIGHT_SETUP 1
|
||||
mova m0, [pw_1]
|
||||
movd m2, r2m
|
||||
movd m2, r3m
|
||||
pslld m0, m2 ; 1<<log2_denom
|
||||
SPLATW m0, m0
|
||||
shl r4, 19 ; *8, move to upper half of dword
|
||||
lea r4, [r4+r3*2+0x10000]
|
||||
movd m3, r4d ; weight<<1 | 1+(offset<<(3))
|
||||
shl r5, 19 ; *8, move to upper half of dword
|
||||
lea r5, [r5+r4*2+0x10000]
|
||||
movd m3, r5d ; weight<<1 | 1+(offset<<(3))
|
||||
pshufd m3, m3, 0
|
||||
mova m4, [pw_pixel_max]
|
||||
paddw m2, [sq_1] ; log2_denom+1
|
||||
@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
|
||||
%endmacro
|
||||
|
||||
%macro WEIGHT_FUNC_DBL 1
|
||||
cglobal h264_weight_16x16_10_%1
|
||||
WEIGHT_PROLOGUE 16
|
||||
cglobal h264_weight_16_10_%1
|
||||
WEIGHT_PROLOGUE
|
||||
WEIGHT_SETUP %1
|
||||
.nextrow
|
||||
WEIGHT_OP %1, 0
|
||||
@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
|
||||
WEIGHT_OP %1, 16
|
||||
mova [r0+16], m5
|
||||
add r0, r1
|
||||
dec t0
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
cglobal h264_weight_16x8_10_%1
|
||||
mov t0, 8
|
||||
jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
|
||||
|
||||
|
||||
%macro WEIGHT_FUNC_MM 1
|
||||
cglobal h264_weight_8x16_10_%1
|
||||
WEIGHT_PROLOGUE 16
|
||||
cglobal h264_weight_8_10_%1
|
||||
WEIGHT_PROLOGUE
|
||||
WEIGHT_SETUP %1
|
||||
.nextrow
|
||||
WEIGHT_OP %1, 0
|
||||
mova [r0], m5
|
||||
add r0, r1
|
||||
dec t0
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
cglobal h264_weight_8x8_10_%1
|
||||
mov t0, 8
|
||||
jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
|
||||
|
||||
cglobal h264_weight_8x4_10_%1
|
||||
mov t0, 4
|
||||
jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
|
||||
|
||||
|
||||
%macro WEIGHT_FUNC_HALF_MM 1
|
||||
cglobal h264_weight_4x8_10_%1
|
||||
WEIGHT_PROLOGUE 4
|
||||
cglobal h264_weight_4_10_%1
|
||||
WEIGHT_PROLOGUE
|
||||
sar r2d, 1
|
||||
WEIGHT_SETUP %1
|
||||
lea r3, [r1*2]
|
||||
.nextrow
|
||||
@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
|
||||
movh [r0], m5
|
||||
movhps [r0+r1], m5
|
||||
add r0, r3
|
||||
dec t0
|
||||
dec r2d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
cglobal h264_weight_4x4_10_%1
|
||||
mov t0, 2
|
||||
jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
|
||||
|
||||
cglobal h264_weight_4x2_10_%1
|
||||
mov t0, 1
|
||||
jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
|
||||
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
|
||||
; int weightd, int weights, int offset);
|
||||
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
|
||||
; int log2_denom, int weightd, int weights, int offset);
|
||||
;-----------------------------------------------------------------------------
|
||||
%ifdef ARCH_X86_32
|
||||
DECLARE_REG_TMP 2,3
|
||||
DECLARE_REG_TMP 3
|
||||
%else
|
||||
DECLARE_REG_TMP 10,2
|
||||
DECLARE_REG_TMP 10
|
||||
%endif
|
||||
|
||||
%macro BIWEIGHT_PROLOGUE 1
|
||||
mov t0, %1
|
||||
%macro BIWEIGHT_PROLOGUE 0
|
||||
.prologue
|
||||
PROLOGUE 0,7,8
|
||||
movifnidn r0, r0mp
|
||||
movifnidn r1, r1mp
|
||||
movifnidn t1d, r2m
|
||||
movifnidn r4d, r4m
|
||||
movifnidn r2d, r2m
|
||||
movifnidn r5d, r5m
|
||||
movifnidn r6d, r6m
|
||||
movifnidn t0d, r7m
|
||||
%endmacro
|
||||
|
||||
%macro BIWEIGHT_SETUP 1
|
||||
lea r6, [r6*4+1] ; (offset<<2)+1
|
||||
or r6, 1
|
||||
shl r5, 16
|
||||
or r4, r5
|
||||
movd m4, r4d ; weightd | weights
|
||||
movd m5, r6d ; (offset+1)|1
|
||||
movd m6, r3m ; log2_denom
|
||||
lea t0, [t0*4+1] ; (offset<<2)+1
|
||||
or t0, 1
|
||||
shl r6, 16
|
||||
or r5, r6
|
||||
movd m4, r5d ; weightd | weights
|
||||
movd m5, t0d ; (offset+1)|1
|
||||
movd m6, r4m ; log2_denom
|
||||
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
|
||||
paddd m6, [sq_1]
|
||||
pshufd m4, m4, 0
|
||||
pshufd m5, m5, 0
|
||||
mova m3, [pw_pixel_max]
|
||||
movifnidn r3d, r3m
|
||||
%ifnidn %1, sse4
|
||||
pxor m7, m7
|
||||
%endif
|
||||
@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
|
||||
%endmacro
|
||||
|
||||
%macro BIWEIGHT_FUNC_DBL 1
|
||||
cglobal h264_biweight_16x16_10_%1
|
||||
BIWEIGHT_PROLOGUE 16
|
||||
cglobal h264_biweight_16_10_%1
|
||||
BIWEIGHT_PROLOGUE
|
||||
BIWEIGHT_SETUP %1
|
||||
.nextrow
|
||||
BIWEIGHT %1, 0
|
||||
mova [r0 ], m0
|
||||
BIWEIGHT %1, 16
|
||||
mova [r0+16], m0
|
||||
add r0, t1
|
||||
add r1, t1
|
||||
dec t0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
cglobal h264_biweight_16x8_10_%1
|
||||
mov t0, 8
|
||||
jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
|
||||
BIWEIGHT_FUNC_DBL sse4
|
||||
|
||||
%macro BIWEIGHT_FUNC 1
|
||||
cglobal h264_biweight_8x16_10_%1
|
||||
BIWEIGHT_PROLOGUE 16
|
||||
cglobal h264_biweight_8_10_%1
|
||||
BIWEIGHT_PROLOGUE
|
||||
BIWEIGHT_SETUP %1
|
||||
.nextrow
|
||||
BIWEIGHT %1, 0
|
||||
mova [r0], m0
|
||||
add r0, t1
|
||||
add r1, t1
|
||||
dec t0
|
||||
add r0, r2
|
||||
add r1, r2
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
cglobal h264_biweight_8x8_10_%1
|
||||
mov t0, 8
|
||||
jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
|
||||
|
||||
cglobal h264_biweight_8x4_10_%1
|
||||
mov t0, 4
|
||||
jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
|
||||
BIWEIGHT_FUNC sse4
|
||||
|
||||
%macro BIWEIGHT_FUNC_HALF 1
|
||||
cglobal h264_biweight_4x8_10_%1
|
||||
BIWEIGHT_PROLOGUE 4
|
||||
cglobal h264_biweight_4_10_%1
|
||||
BIWEIGHT_PROLOGUE
|
||||
BIWEIGHT_SETUP %1
|
||||
lea r4, [t1*2]
|
||||
sar r3d, 1
|
||||
lea r4, [r2*2]
|
||||
.nextrow
|
||||
BIWEIGHT %1, 0, t1
|
||||
BIWEIGHT %1, 0, r2
|
||||
movh [r0 ], m0
|
||||
movhps [r0+t1], m0
|
||||
movhps [r0+r2], m0
|
||||
add r0, r4
|
||||
add r1, r4
|
||||
dec t0
|
||||
dec r3d
|
||||
jnz .nextrow
|
||||
REP_RET
|
||||
|
||||
cglobal h264_biweight_4x4_10_%1
|
||||
mov t0, 2
|
||||
jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
|
||||
|
||||
cglobal h264_biweight_4x2_10_%1
|
||||
mov t0, 1
|
||||
jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
|
||||
%endmacro
|
||||
|
||||
INIT_XMM
|
||||
|
@ -298,57 +298,47 @@ LF_IFUNC(v, luma_intra, 10, mmxext)
|
||||
/***********************************/
|
||||
/* weighted prediction */
|
||||
|
||||
#define H264_WEIGHT(W, H, OPT) \
|
||||
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
|
||||
int stride, int log2_denom, int weight, int offset);
|
||||
#define H264_WEIGHT(W, OPT) \
|
||||
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, \
|
||||
int stride, int height, int log2_denom, int weight, int offset);
|
||||
|
||||
#define H264_BIWEIGHT(W, H, OPT) \
|
||||
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
|
||||
uint8_t *src, int stride, int log2_denom, int weightd, \
|
||||
#define H264_BIWEIGHT(W, OPT) \
|
||||
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, \
|
||||
uint8_t *src, int stride, int height, int log2_denom, int weightd, \
|
||||
int weights, int offset);
|
||||
|
||||
#define H264_BIWEIGHT_MMX(W,H) \
|
||||
H264_WEIGHT (W, H, mmx2) \
|
||||
H264_BIWEIGHT(W, H, mmx2)
|
||||
#define H264_BIWEIGHT_MMX(W) \
|
||||
H264_WEIGHT (W, mmx2) \
|
||||
H264_BIWEIGHT(W, mmx2)
|
||||
|
||||
#define H264_BIWEIGHT_MMX_SSE(W,H) \
|
||||
H264_BIWEIGHT_MMX(W, H) \
|
||||
H264_WEIGHT (W, H, sse2) \
|
||||
H264_BIWEIGHT (W, H, sse2) \
|
||||
H264_BIWEIGHT (W, H, ssse3)
|
||||
#define H264_BIWEIGHT_MMX_SSE(W) \
|
||||
H264_BIWEIGHT_MMX(W) \
|
||||
H264_WEIGHT (W, sse2) \
|
||||
H264_BIWEIGHT (W, sse2) \
|
||||
H264_BIWEIGHT (W, ssse3)
|
||||
|
||||
H264_BIWEIGHT_MMX_SSE(16, 16)
|
||||
H264_BIWEIGHT_MMX_SSE(16, 8)
|
||||
H264_BIWEIGHT_MMX_SSE( 8, 16)
|
||||
H264_BIWEIGHT_MMX_SSE( 8, 8)
|
||||
H264_BIWEIGHT_MMX_SSE( 8, 4)
|
||||
H264_BIWEIGHT_MMX ( 4, 8)
|
||||
H264_BIWEIGHT_MMX ( 4, 4)
|
||||
H264_BIWEIGHT_MMX ( 4, 2)
|
||||
H264_BIWEIGHT_MMX_SSE(16)
|
||||
H264_BIWEIGHT_MMX_SSE( 8)
|
||||
H264_BIWEIGHT_MMX ( 4)
|
||||
|
||||
#define H264_WEIGHT_10(W, H, DEPTH, OPT) \
|
||||
void ff_h264_weight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
|
||||
int stride, int log2_denom, int weight, int offset);
|
||||
#define H264_WEIGHT_10(W, DEPTH, OPT) \
|
||||
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
|
||||
int stride, int height, int log2_denom, int weight, int offset);
|
||||
|
||||
#define H264_BIWEIGHT_10(W, H, DEPTH, OPT) \
|
||||
void ff_h264_biweight_ ## W ## x ## H ## _ ## DEPTH ## _ ## OPT \
|
||||
(uint8_t *dst, uint8_t *src, int stride, int log2_denom, \
|
||||
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
|
||||
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT \
|
||||
(uint8_t *dst, uint8_t *src, int stride, int height, int log2_denom, \
|
||||
int weightd, int weights, int offset);
|
||||
|
||||
#define H264_BIWEIGHT_10_SSE(W, H, DEPTH) \
|
||||
H264_WEIGHT_10 (W, H, DEPTH, sse2) \
|
||||
H264_WEIGHT_10 (W, H, DEPTH, sse4) \
|
||||
H264_BIWEIGHT_10(W, H, DEPTH, sse2) \
|
||||
H264_BIWEIGHT_10(W, H, DEPTH, sse4)
|
||||
#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
|
||||
H264_WEIGHT_10 (W, DEPTH, sse2) \
|
||||
H264_WEIGHT_10 (W, DEPTH, sse4) \
|
||||
H264_BIWEIGHT_10(W, DEPTH, sse2) \
|
||||
H264_BIWEIGHT_10(W, DEPTH, sse4)
|
||||
|
||||
H264_BIWEIGHT_10_SSE(16, 16, 10)
|
||||
H264_BIWEIGHT_10_SSE(16, 8, 10)
|
||||
H264_BIWEIGHT_10_SSE( 8, 16, 10)
|
||||
H264_BIWEIGHT_10_SSE( 8, 8, 10)
|
||||
H264_BIWEIGHT_10_SSE( 8, 4, 10)
|
||||
H264_BIWEIGHT_10_SSE( 4, 8, 10)
|
||||
H264_BIWEIGHT_10_SSE( 4, 4, 10)
|
||||
H264_BIWEIGHT_10_SSE( 4, 2, 10)
|
||||
H264_BIWEIGHT_10_SSE(16, 10)
|
||||
H264_BIWEIGHT_10_SSE( 8, 10)
|
||||
H264_BIWEIGHT_10_SSE( 4, 10)
|
||||
|
||||
void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chroma_format_idc)
|
||||
{
|
||||
@ -394,23 +384,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
|
||||
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
|
||||
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
|
||||
#endif
|
||||
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
|
||||
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
|
||||
c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
|
||||
c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
|
||||
c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
|
||||
c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
|
||||
c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
|
||||
c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
|
||||
c->weight_h264_pixels_tab[0]= ff_h264_weight_16_mmx2;
|
||||
c->weight_h264_pixels_tab[1]= ff_h264_weight_8_mmx2;
|
||||
c->weight_h264_pixels_tab[2]= ff_h264_weight_4_mmx2;
|
||||
|
||||
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
|
||||
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
|
||||
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
|
||||
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
|
||||
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
|
||||
c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
|
||||
c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
|
||||
c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
|
||||
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_mmx2;
|
||||
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_mmx2;
|
||||
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_4_mmx2;
|
||||
|
||||
if (mm_flags&AV_CPU_FLAG_SSE2) {
|
||||
c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
|
||||
@ -422,17 +402,11 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
|
||||
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
|
||||
c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;
|
||||
|
||||
c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
|
||||
c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
|
||||
c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
|
||||
c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
|
||||
c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;
|
||||
c->weight_h264_pixels_tab[0]= ff_h264_weight_16_sse2;
|
||||
c->weight_h264_pixels_tab[1]= ff_h264_weight_8_sse2;
|
||||
|
||||
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
|
||||
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
|
||||
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
|
||||
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
|
||||
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;
|
||||
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_sse2;
|
||||
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_sse2;
|
||||
|
||||
#if HAVE_ALIGNED_STACK
|
||||
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
|
||||
@ -442,11 +416,8 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
|
||||
#endif
|
||||
}
|
||||
if (mm_flags&AV_CPU_FLAG_SSSE3) {
|
||||
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
|
||||
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
|
||||
c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
|
||||
c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
|
||||
c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
|
||||
c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16_ssse3;
|
||||
c->biweight_h264_pixels_tab[1]= ff_h264_biweight_8_ssse3;
|
||||
}
|
||||
if (mm_flags&AV_CPU_FLAG_AVX) {
|
||||
#if HAVE_ALIGNED_STACK
|
||||
@ -485,23 +456,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
|
||||
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
|
||||
#endif
|
||||
|
||||
c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse2;
|
||||
c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse2;
|
||||
c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse2;
|
||||
c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse2;
|
||||
c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse2;
|
||||
c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse2;
|
||||
c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse2;
|
||||
c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse2;
|
||||
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
|
||||
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
|
||||
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
|
||||
|
||||
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse2;
|
||||
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse2;
|
||||
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse2;
|
||||
c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse2;
|
||||
c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse2;
|
||||
c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse2;
|
||||
c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse2;
|
||||
c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse2;
|
||||
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
|
||||
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
|
||||
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
|
||||
|
||||
c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
|
||||
c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
|
||||
@ -513,23 +474,13 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, const int chrom
|
||||
#endif
|
||||
}
|
||||
if (mm_flags&AV_CPU_FLAG_SSE4) {
|
||||
c->weight_h264_pixels_tab[0] = ff_h264_weight_16x16_10_sse4;
|
||||
c->weight_h264_pixels_tab[1] = ff_h264_weight_16x8_10_sse4;
|
||||
c->weight_h264_pixels_tab[2] = ff_h264_weight_8x16_10_sse4;
|
||||
c->weight_h264_pixels_tab[3] = ff_h264_weight_8x8_10_sse4;
|
||||
c->weight_h264_pixels_tab[4] = ff_h264_weight_8x4_10_sse4;
|
||||
c->weight_h264_pixels_tab[5] = ff_h264_weight_4x8_10_sse4;
|
||||
c->weight_h264_pixels_tab[6] = ff_h264_weight_4x4_10_sse4;
|
||||
c->weight_h264_pixels_tab[7] = ff_h264_weight_4x2_10_sse4;
|
||||
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
|
||||
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
|
||||
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
|
||||
|
||||
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16x16_10_sse4;
|
||||
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_16x8_10_sse4;
|
||||
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_8x16_10_sse4;
|
||||
c->biweight_h264_pixels_tab[3] = ff_h264_biweight_8x8_10_sse4;
|
||||
c->biweight_h264_pixels_tab[4] = ff_h264_biweight_8x4_10_sse4;
|
||||
c->biweight_h264_pixels_tab[5] = ff_h264_biweight_4x8_10_sse4;
|
||||
c->biweight_h264_pixels_tab[6] = ff_h264_biweight_4x4_10_sse4;
|
||||
c->biweight_h264_pixels_tab[7] = ff_h264_biweight_4x2_10_sse4;
|
||||
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
|
||||
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
|
||||
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
|
||||
}
|
||||
#if HAVE_AVX
|
||||
if (mm_flags&AV_CPU_FLAG_AVX) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user