ARM: NEON optimised H.264 biweighted prediction
Originally committed as revision 16770 to svn://svn.ffmpeg.org/ffmpeg/trunk
This commit is contained in:
parent
1615fb91a1
commit
5a29589b81
@ -92,6 +92,31 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
|||||||
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
|
||||||
int beta, int8_t *tc0);
|
int beta, int8_t *tc0);
|
||||||
|
|
||||||
|
/*
 * NEON H.264 biweighted prediction, one entry point per WxH block size.
 * The bodies are generated by the biweight_entry/biweight_func assembly
 * macros; the pointers are stored in c->biweight_h264_pixels_tab[0..7]
 * by ff_dsputil_init_neon().
 * As implemented by the assembly: rows are read from both dst and src,
 * dst pixels are weighted by weightd and src pixels by weights, a rounding
 * term derived from offset and log2_den is added, the sum is shifted right
 * by log2_den + 1, saturated to 8 bits and written back to dst.
 * stride is the byte step applied to both dst and src after each row.
 */
void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int log2_den, int weightd, int weights,
|
||||||
|
int offset);
|
||||||
|
void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int log2_den, int weightd, int weights,
|
||||||
|
int offset);
|
||||||
|
void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int log2_den, int weightd, int weights,
|
||||||
|
int offset);
|
||||||
|
void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int log2_den, int weightd, int weights,
|
||||||
|
int offset);
|
||||||
|
void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int log2_den, int weightd, int weights,
|
||||||
|
int offset);
|
||||||
|
void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int log2_den, int weightd, int weights,
|
||||||
|
int offset);
|
||||||
|
void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int log2_den, int weightd, int weights,
|
||||||
|
int offset);
|
||||||
|
void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
|
||||||
|
int log2_den, int weightd, int weights,
|
||||||
|
int offset);
|
||||||
|
|
||||||
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
|
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
|
||||||
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
|
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
|
||||||
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
|
void ff_h264_idct_add16_neon(uint8_t *dst, const int *block_offset,
|
||||||
@ -176,6 +201,15 @@ void ff_dsputil_init_neon(DSPContext *c, AVCodecContext *avctx)
|
|||||||
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
|
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
|
||||||
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
|
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
|
||||||
|
|
||||||
|
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
|
||||||
|
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
|
||||||
|
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
|
||||||
|
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
|
||||||
|
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
|
||||||
|
c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
|
||||||
|
c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
|
||||||
|
c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
|
||||||
|
|
||||||
c->h264_idct_add = ff_h264_idct_add_neon;
|
c->h264_idct_add = ff_h264_idct_add_neon;
|
||||||
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
|
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
|
||||||
c->h264_idct_add16 = ff_h264_idct_add16_neon;
|
c->h264_idct_add16 = ff_h264_idct_add16_neon;
|
||||||
|
@ -1368,3 +1368,171 @@ function ff_put_h264_qpel16_mc33_neon, export=1
|
|||||||
sub r1, r1, #1
|
sub r1, r1, #1
|
||||||
b put_h264_qpel16_mc11
|
b put_h264_qpel16_mc11
|
||||||
.endfunc
|
.endfunc
|
||||||
|
|
||||||
|
@ Biweighted prediction
|
||||||
|
|
||||||
|
@ biweight_16: row loop for 16-pixel-wide blocks, two rows per iteration.
@   macs - widening multiply-accumulate mnemonic applied to rows read via r1
@   macd - widening multiply-accumulate mnemonic applied to rows read via r0
@ Expected on entry (prepared by the code that expands this macro):
@   r0, r1 = the two input row pointers, r2 = row stride in bytes,
@   r6     = output row pointer,
@   r4, r5 = weights; only their low bytes are used (vdup.8),
@   q8     = initial value for every 16-bit accumulator lane,
@   q9     = per-lane shift count applied with vshl.s16,
@   ip     = number of rows to process (reduced by 2 per iteration).
@ The macro finishes with pop {r4-r6, pc}, so it also returns from the
@ function it is expanded in.
.macro biweight_16 macs, macd
|
||||||
|
@ d0 = weight for the r0 rows, d1 = weight for the r1 rows (8 byte lanes each)
vdup.8 d0, r4
|
||||||
|
vdup.8 d1, r5
|
||||||
|
@ q2/q3 accumulate the low/high 8 pixels of the first row of each pair
vmov q2, q8
|
||||||
|
vmov q3, q8
|
||||||
|
@ flags set here are consumed by the bne at the bottom of the loop;
@ nothing in between updates them
1: subs ip, ip, #2
|
||||||
|
@ first row: 16 bytes from r0 (128-bit alignment hint), then step by stride
vld1.8 {d20-d21},[r0,:128], r2
|
||||||
|
\macd q2, d0, d20
|
||||||
|
pld [r0]
|
||||||
|
\macd q3, d0, d21
|
||||||
|
vld1.8 {d22-d23},[r1,:128], r2
|
||||||
|
\macs q2, d1, d22
|
||||||
|
pld [r1]
|
||||||
|
\macs q3, d1, d23
|
||||||
|
@ second row: q12/q13 accumulators are re-seeded from q8 on every iteration
vmov q12, q8
|
||||||
|
vld1.8 {d28-d29},[r0,:128], r2
|
||||||
|
vmov q13, q8
|
||||||
|
\macd q12, d0, d28
|
||||||
|
pld [r0]
|
||||||
|
\macd q13, d0, d29
|
||||||
|
vld1.8 {d30-d31},[r1,:128], r2
|
||||||
|
\macs q12, d1, d30
|
||||||
|
pld [r1]
|
||||||
|
\macs q13, d1, d31
|
||||||
|
@ scale by the per-lane count in q9, then narrow signed 16-bit lanes to
@ unsigned 8-bit with saturation; the packed first row ends up in d4-d5
vshl.s16 q2, q2, q9
|
||||||
|
vshl.s16 q3, q3, q9
|
||||||
|
vqmovun.s16 d4, q2
|
||||||
|
vqmovun.s16 d5, q3
|
||||||
|
@ same for the second row, packed into d24-d25
vshl.s16 q12, q12, q9
|
||||||
|
vshl.s16 q13, q13, q9
|
||||||
|
vqmovun.s16 d24, q12
|
||||||
|
vqmovun.s16 d25, q13
|
||||||
|
@ q3 is free once d5 has been written, so re-seed it for the next iteration
vmov q3, q8
|
||||||
|
vst1.8 {d4- d5}, [r6,:128], r2
|
||||||
|
@ q2 (d4-d5) may only be re-seeded after the store above
vmov q2, q8
|
||||||
|
vst1.8 {d24-d25},[r6,:128], r2
|
||||||
|
bne 1b
|
||||||
|
@ restore saved registers and return (pc takes the saved return address)
pop {r4-r6, pc}
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ biweight_8: row loop for 8-pixel-wide blocks, two rows per iteration.
@   macs - widening multiply-accumulate mnemonic applied to rows read via r1
@   macd - widening multiply-accumulate mnemonic applied to rows read via r0
@ Expected on entry (prepared by the code that expands this macro):
@   r0, r1 = the two input row pointers, r2 = row stride in bytes,
@   r6     = output row pointer,
@   r4, r5 = weights; only their low bytes are used (vdup.8),
@   q8     = initial value for every 16-bit accumulator lane,
@   q9     = per-lane shift count applied with vshl.s16,
@   ip     = number of rows to process (reduced by 2 per iteration).
@ The macro finishes with pop {r4-r6, pc}, so it also returns from the
@ function it is expanded in.
.macro biweight_8 macs, macd
|
||||||
|
@ d0 = weight for the r0 rows, d1 = weight for the r1 rows
vdup.8 d0, r4
|
||||||
|
vdup.8 d1, r5
|
||||||
|
@ q1 accumulates the first row of each pair, q10 the second
vmov q1, q8
|
||||||
|
vmov q10, q8
|
||||||
|
@ flags set here are consumed by the bne at the bottom of the loop
1: subs ip, ip, #2
|
||||||
|
@ first row: 8 bytes from each input (64-bit alignment hint)
vld1.8 {d4},[r0,:64], r2
|
||||||
|
\macd q1, d0, d4
|
||||||
|
pld [r0]
|
||||||
|
vld1.8 {d5},[r1,:64], r2
|
||||||
|
\macs q1, d1, d5
|
||||||
|
pld [r1]
|
||||||
|
@ second row
vld1.8 {d6},[r0,:64], r2
|
||||||
|
\macd q10, d0, d6
|
||||||
|
pld [r0]
|
||||||
|
vld1.8 {d7},[r1,:64], r2
|
||||||
|
\macs q10, d1, d7
|
||||||
|
pld [r1]
|
||||||
|
@ scale by q9 and narrow with unsigned saturation: row 1 -> d2, row 2 -> d4
vshl.s16 q1, q1, q9
|
||||||
|
vqmovun.s16 d2, q1
|
||||||
|
vshl.s16 q10, q10, q9
|
||||||
|
vqmovun.s16 d4, q10
|
||||||
|
@ re-seed the accumulators for the next iteration; q1 (which contains d2)
@ is only reset after d2 has been stored
vmov q10, q8
|
||||||
|
vst1.8 {d2},[r6,:64], r2
|
||||||
|
vmov q1, q8
|
||||||
|
vst1.8 {d4},[r6,:64], r2
|
||||||
|
bne 1b
|
||||||
|
@ restore saved registers and return
pop {r4-r6, pc}
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ biweight_4: row loop for 4-pixel-wide blocks. Two 4-byte rows are packed
@ into the two 32-bit lanes of one d register, so each accumulator covers
@ two rows and a full iteration handles four rows.
@   macs - widening multiply-accumulate mnemonic applied to rows read via r1
@   macd - widening multiply-accumulate mnemonic applied to rows read via r0
@ Expected on entry (prepared by the code that expands this macro):
@   r0, r1 = the two input row pointers, r2 = row stride in bytes,
@   r6     = output row pointer,
@   r4, r5 = weights; only their low bytes are used (vdup.8),
@   q8     = initial value for every 16-bit accumulator lane,
@   q9     = per-lane shift count applied with vshl.s16,
@   ip     = number of rows to process (reduced by 4 per iteration).
@ Both exits use pop {r4-r6, pc}, so the macro also returns from the
@ function it is expanded in.
.macro biweight_4 macs, macd
|
||||||
|
@ d0 = weight for the r0 rows, d1 = weight for the r1 rows
vdup.8 d0, r4
|
||||||
|
vdup.8 d1, r5
|
||||||
|
@ q1 accumulates rows 0-1 of each group, q10 rows 2-3
vmov q1, q8
|
||||||
|
vmov q10, q8
|
||||||
|
@ flags set here feed both the blt below and the bne at the loop end
1: subs ip, ip, #4
|
||||||
|
@ rows 0 and 1 from r0 go into lanes 0 and 1 of d4
vld1.32 {d4[0]},[r0,:32], r2
|
||||||
|
vld1.32 {d4[1]},[r0,:32], r2
|
||||||
|
\macd q1, d0, d4
|
||||||
|
pld [r0]
|
||||||
|
@ rows 0 and 1 from r1 go into lanes 0 and 1 of d5
vld1.32 {d5[0]},[r1,:32], r2
|
||||||
|
vld1.32 {d5[1]},[r1,:32], r2
|
||||||
|
\macs q1, d1, d5
|
||||||
|
pld [r1]
|
||||||
|
@ row count went negative: fewer than four rows were left, so only the
@ two rows already accumulated in q1 are finished at label 2
blt 2f
|
||||||
|
@ rows 2 and 3
vld1.32 {d6[0]},[r0,:32], r2
|
||||||
|
vld1.32 {d6[1]},[r0,:32], r2
|
||||||
|
\macd q10, d0, d6
|
||||||
|
pld [r0]
|
||||||
|
vld1.32 {d7[0]},[r1,:32], r2
|
||||||
|
vld1.32 {d7[1]},[r1,:32], r2
|
||||||
|
\macs q10, d1, d7
|
||||||
|
pld [r1]
|
||||||
|
@ scale by q9 and narrow with unsigned saturation:
@ rows 0-1 -> d2, rows 2-3 -> d4
vshl.s16 q1, q1, q9
|
||||||
|
vqmovun.s16 d2, q1
|
||||||
|
vshl.s16 q10, q10, q9
|
||||||
|
vqmovun.s16 d4, q10
|
||||||
|
vmov q10, q8
|
||||||
|
@ each 32-bit lane is one output row
vst1.32 {d2[0]},[r6,:32], r2
|
||||||
|
vst1.32 {d2[1]},[r6,:32], r2
|
||||||
|
@ q1 (which contains d2) is re-seeded only after both lanes were stored
vmov q1, q8
|
||||||
|
vst1.32 {d4[0]},[r6,:32], r2
|
||||||
|
vst1.32 {d4[1]},[r6,:32], r2
|
||||||
|
bne 1b
|
||||||
|
pop {r4-r6, pc}
|
||||||
|
@ tail for a two-row remainder: finish and store rows 0-1 only, then return
2: vshl.s16 q1, q1, q9
|
||||||
|
vqmovun.s16 d2, q1
|
||||||
|
vst1.32 {d2[0]},[r6,:32], r2
|
||||||
|
vst1.32 {d2[1]},[r6,:32], r2
|
||||||
|
pop {r4-r6, pc}
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ biweight_func: emits the shared worker biweight_h264_pixels_<w>_neon for
@ block width w. It is entered with ip already holding the row count (the
@ biweight_entry stubs set it) and with the C arguments still in place:
@   r0 = dst, r1 = src, r2 = stride, r3 = log2_den,
@   stack = weightd, weights, offset (per the C prototypes).
@ It prepares q8 (rounding/offset seed), q9 (shift count) and r6 (output
@ pointer), then picks one of four expansions of biweight_<w> according to
@ the signs of the two weights. The multiply-accumulates are unsigned
@ (.u8), so a negative weight is negated and applied with vmlsl instead.
@ Each biweight_<w> expansion ends in pop {r4-r6, pc}, so the four labelled
@ variants never fall through into one another.
.macro biweight_func w
|
||||||
|
function biweight_h264_pixels_\w\()_neon
|
||||||
|
push {r4-r6, lr}
|
||||||
|
@ 16 bytes were just pushed, so sp + 16 is the first stack argument
add r4, sp, #16
|
||||||
|
@ r4 = weightd, r5 = weights, r6 = offset
ldm r4, {r4-r6}
|
||||||
|
@ lr = sign bit of weightd (0 or 1)
lsr lr, r4, #31
|
||||||
|
add r6, r6, #1
|
||||||
|
@ lr ^= top two bits of weights, giving a selector:
@   0 = both weights non-negative, 1 = only weightd negative,
@   2 = both negative,             3 = only weights negative
@ NOTE(review): relies on bits 31 and 30 of weights being equal, i.e. a
@ small-magnitude weight - confirm against the caller's value range.
@ The Z flag set here is tested by the beq 10f further down; none of the
@ instructions in between update the flags.
eors lr, lr, r5, lsr #30
|
||||||
|
@ r6 = (offset + 1) | 1
orr r6, r6, #1
|
||||||
|
vdup.16 q9, r3
|
||||||
|
@ r6 = ((offset + 1) | 1) << log2_den
lsl r6, r6, r3
|
||||||
|
@ q9 lanes = ~log2_den = -(log2_den + 1); vshl.s16 with a negative count
@ shifts right, so the loops shift right by log2_den + 1
vmvn q9, q9
|
||||||
|
@ q8 = accumulator seed (low 16 bits of r6 in every lane)
vdup.16 q8, r6
|
||||||
|
@ r6 is reused as the output pointer; r0 keeps advancing as a read pointer
mov r6, r0
|
||||||
|
beq 10f
|
||||||
|
subs lr, lr, #1
|
||||||
|
beq 20f
|
||||||
|
subs lr, lr, #1
|
||||||
|
beq 30f
|
||||||
|
b 40f
|
||||||
|
@ macro arguments are (macs, macd): first applies to src, second to dst
@ 10: both weights non-negative, accumulate both
10: biweight_\w vmlal.u8, vmlal.u8
|
||||||
|
@ 20: weightd negative, use its magnitude and subtract the dst term
20: rsb r4, r4, #0
|
||||||
|
biweight_\w vmlal.u8, vmlsl.u8
|
||||||
|
@ 30: both negative, use both magnitudes and subtract both terms
30: rsb r4, r4, #0
|
||||||
|
rsb r5, r5, #0
|
||||||
|
biweight_\w vmlsl.u8, vmlsl.u8
|
||||||
|
@ 40: weights negative, use its magnitude and subtract the src term
40: rsb r5, r5, #0
|
||||||
|
biweight_\w vmlsl.u8, vmlal.u8
|
||||||
|
.endfunc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ biweight_entry: emits the exported stub ff_biweight_h264_pixels_<w>x<h>_neon.
@   w - block width, selects the worker biweight_h264_pixels_<w>_neon
@   h - block height, loaded into ip as the row count for the worker
@   b - when non-zero (default) the stub branches to the worker; with b=0
@       no branch is emitted and execution continues into whatever code is
@       assembled directly after the stub, so a b=0 stub must be placed
@       immediately before the matching biweight_func expansion.
.macro biweight_entry w, h, b=1
|
||||||
|
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
|
||||||
|
@ ip = number of rows; argument registers and the stack are left untouched
mov ip, #\h
|
||||||
|
.if \b
|
||||||
|
@ plain branch (not a call): the worker returns straight to the C caller
b biweight_h264_pixels_\w\()_neon
|
||||||
|
.endif
|
||||||
|
.endfunc
|
||||||
|
.endm
|
||||||
|
|
||||||
|
@ Entry points per width. In each group the last stub is emitted with b=0
@ and therefore has no branch: it runs straight into the worker generated
@ by the biweight_func line that follows it. Keep this ordering intact.
biweight_entry 16, 8
|
||||||
|
biweight_entry 16, 16, b=0
|
||||||
|
biweight_func 16
|
||||||
|
|
||||||
|
@ 8-pixel-wide variants: 8x16 and 8x4 branch, 8x8 falls through
biweight_entry 8, 16
|
||||||
|
biweight_entry 8, 4
|
||||||
|
biweight_entry 8, 8, b=0
|
||||||
|
biweight_func 8
|
||||||
|
|
||||||
|
@ 4-pixel-wide variants: 4x8 and 4x2 branch, 4x4 falls through
biweight_entry 4, 8
|
||||||
|
biweight_entry 4, 2
|
||||||
|
biweight_entry 4, 4, b=0
|
||||||
|
biweight_func 4
|
||||||
|
Loading…
x
Reference in New Issue
Block a user