ARMv6 optimized fdct4x4
Optimized fdct4x4 (8x4) for ARMv6 instruction set. - No interlocks in Cortex-A8 pipeline - One interlock cycle in ARM11 pipeline - About 2.16 times faster than current C-code compiled with -O3 Change-Id: I60484ecd144365da45bb68a960d30196b59952b8
This commit is contained in:
parent
185557344a
commit
a61785b6a1
@ -59,9 +59,9 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
|
|||||||
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
|
cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_c;*/
|
||||||
|
|
||||||
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
|
/*cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_c;
|
||||||
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;
|
cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_c;*/
|
||||||
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_c;
|
cpi->rtcd.fdct.fast4x4 = vp8_fast_fdct4x4_armv6;
|
||||||
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_c;*/
|
cpi->rtcd.fdct.fast8x4 = vp8_fast_fdct8x4_armv6;
|
||||||
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6;
|
cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_armv6;
|
||||||
|
|
||||||
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
|
/*cpi->rtcd.encodemb.berr = vp8_block_error_c;
|
||||||
|
262
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
Normal file
262
vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
Normal file
@ -0,0 +1,262 @@
|
|||||||
|
;
|
||||||
|
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||||
|
;
|
||||||
|
; Use of this source code is governed by a BSD-style license
|
||||||
|
; that can be found in the LICENSE file in the root of the source
|
||||||
|
; tree. An additional intellectual property rights grant can be found
|
||||||
|
; in the file PATENTS. All contributing project authors may
|
||||||
|
; be found in the AUTHORS file in the root of the source tree.
|
||||||
|
;
|
||||||
|
|
||||||
|
EXPORT |vp8_fast_fdct4x4_armv6|
|
||||||
|
|
||||||
|
ARM
|
||||||
|
REQUIRE8
|
||||||
|
PRESERVE8
|
||||||
|
|
||||||
|
AREA |.text|, CODE, READONLY
|
||||||
|
; void vp8_fast_fdct4x4_armv6(short *input, short *output, int pitch) -- same signature as vp8_short_fdct4x4_c
|
||||||
|
|vp8_fast_fdct4x4_armv6| PROC
|
||||||
|
|
||||||
|
stmfd sp!, {r4 - r12, lr}
|
||||||
|
|
||||||
|
; PART 1
|
||||||
|
|
||||||
|
; coeffs 0-3
|
||||||
|
ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
|
||||||
|
|
||||||
|
ldr r10, c7500
|
||||||
|
ldr r11, c14500
|
||||||
|
ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
|
||||||
|
ldr lr, c0x00080008
|
||||||
|
ror r5, r5, #16 ; [i2 | i3]
|
||||||
|
|
||||||
|
qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
|
||||||
|
qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
|
||||||
|
|
||||||
|
add r0, r0, r2 ; update input pointer
|
||||||
|
|
||||||
|
qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
|
||||||
|
; with 2217*4 and 5352*4 without losing the
|
||||||
|
; sign bit (overflow)
|
||||||
|
|
||||||
|
smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
|
||||||
|
smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
|
||||||
|
|
||||||
|
smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||||
|
smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||||
|
|
||||||
|
ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
|
||||||
|
|
||||||
|
pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
|
||||||
|
pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
|
||||||
|
|
||||||
|
str r6, [r1, #4]
|
||||||
|
|
||||||
|
; coeffs 4-7
|
||||||
|
ror r9, r9, #16 ; [i6 | i7]
|
||||||
|
|
||||||
|
qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
|
||||||
|
qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
|
||||||
|
|
||||||
|
add r0, r0, r2 ; update input pointer
|
||||||
|
|
||||||
|
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||||
|
; with 2217*4 and 5352*4 without losing the
|
||||||
|
; sign bit (overflow)
|
||||||
|
|
||||||
|
smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
|
||||||
|
smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
|
||||||
|
|
||||||
|
smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||||
|
smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||||
|
|
||||||
|
ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
|
||||||
|
|
||||||
|
pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
|
||||||
|
pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
|
||||||
|
|
||||||
|
str r6, [r1, #12]
|
||||||
|
|
||||||
|
; coeffs 8-11
|
||||||
|
ror r5, r5, #16 ; [i10 | i11]
|
||||||
|
|
||||||
|
qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
|
||||||
|
qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
|
||||||
|
|
||||||
|
add r0, r0, r2 ; update input pointer
|
||||||
|
|
||||||
|
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||||
|
; with 2217*4 and 5352*4 without losing the
|
||||||
|
; sign bit (overflow)
|
||||||
|
|
||||||
|
smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
|
||||||
|
smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
|
||||||
|
|
||||||
|
smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||||
|
smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||||
|
|
||||||
|
ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
|
||||||
|
|
||||||
|
pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
|
||||||
|
pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
|
||||||
|
|
||||||
|
str r6, [r1, #20]
|
||||||
|
|
||||||
|
; coeffs 12-15
|
||||||
|
ror r5, r5, #16 ; [i14 | i15]
|
||||||
|
|
||||||
|
qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
|
||||||
|
qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
|
||||||
|
|
||||||
|
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||||
|
; with 2217*4 and 5352*4 without losing the
|
||||||
|
; sign bit (overflow)
|
||||||
|
|
||||||
|
smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
|
||||||
|
smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
|
||||||
|
|
||||||
|
smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||||
|
smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||||
|
|
||||||
|
pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
|
||||||
|
pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
|
||||||
|
|
||||||
|
str r6, [r1, #28]
|
||||||
|
|
||||||
|
|
||||||
|
; PART 2 -------------------------------------------------
|
||||||
|
ldr r11, c12000
|
||||||
|
ldr r10, c51000
|
||||||
|
ldr lr, c0x00070007
|
||||||
|
|
||||||
|
qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
|
||||||
|
qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
|
||||||
|
qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
|
||||||
|
qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
|
||||||
|
|
||||||
|
qadd16 r4, r4, lr ; a1 + 7
|
||||||
|
|
||||||
|
add r0, r11, #0x10000 ; add (d!=0)
|
||||||
|
|
||||||
|
qadd16 r2, r4, r5 ; a1 + b1 + 7
|
||||||
|
qsub16 r3, r4, r5 ; a1 - b1 + 7
|
||||||
|
|
||||||
|
ldr r12, c0x08a914e8 ; [2217 | 5352]
|
||||||
|
|
||||||
|
lsl r8, r2, #16 ; prepare bottom halfword for scaling
|
||||||
|
asr r2, r2, #4 ; scale top halfword
|
||||||
|
lsl r9, r3, #16 ; prepare bottom halfword for scaling
|
||||||
|
asr r3, r3, #4 ; scale top halfword
|
||||||
|
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
|
||||||
|
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
|
||||||
|
|
||||||
|
smulbt r2, r6, r12 ; [ ------ | c1*2217]
|
||||||
|
str r4, [r1, #0] ; [ o1 | o0]
|
||||||
|
smultt r3, r6, r12 ; [c1*2217 | ------ ]
|
||||||
|
str r5, [r1, #16] ; [ o9 | o8]
|
||||||
|
|
||||||
|
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
|
||||||
|
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
|
||||||
|
|
||||||
|
smulbb r2, r6, r12 ; [ ------ | c1*5352]
|
||||||
|
smultb r3, r6, r12 ; [c1*5352 | ------ ]
|
||||||
|
|
||||||
|
lsls r6, r7, #16 ; d1 != 0 ?
|
||||||
|
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
|
||||||
|
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
|
||||||
|
asrs r6, r7, #16
|
||||||
|
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
|
||||||
|
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
|
||||||
|
|
||||||
|
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
|
||||||
|
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
|
||||||
|
|
||||||
|
pkhtb r9, r9, r8, asr #16
|
||||||
|
|
||||||
|
sub r4, r4, r2
|
||||||
|
sub r5, r5, r3
|
||||||
|
|
||||||
|
ldr r3, [r1, #4] ; [i3 | i2]
|
||||||
|
|
||||||
|
pkhtb r5, r5, r4, asr #16 ; [o13|o12]
|
||||||
|
|
||||||
|
str r9, [r1, #8] ; [o5 | o4]
|
||||||
|
|
||||||
|
ldr r9, [r1, #12] ; [i7 | i6]
|
||||||
|
ldr r8, [r1, #28] ; [i15|i14]
|
||||||
|
ldr r2, [r1, #20] ; [i11|i10]
|
||||||
|
str r5, [r1, #24] ; [o13|o12]
|
||||||
|
|
||||||
|
qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
|
||||||
|
qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
|
||||||
|
|
||||||
|
qadd16 r4, r4, lr ; a1 + 7
|
||||||
|
|
||||||
|
qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
|
||||||
|
qadd16 r2, r4, r5 ; a1 + b1 + 7
|
||||||
|
qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
|
||||||
|
qsub16 r3, r4, r5 ; a1 - b1 + 7
|
||||||
|
|
||||||
|
lsl r8, r2, #16 ; prepare bottom halfword for scaling
|
||||||
|
asr r2, r2, #4 ; scale top halfword
|
||||||
|
lsl r9, r3, #16 ; prepare bottom halfword for scaling
|
||||||
|
asr r3, r3, #4 ; scale top halfword
|
||||||
|
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
|
||||||
|
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
|
||||||
|
|
||||||
|
smulbt r2, r6, r12 ; [ ------ | c1*2217]
|
||||||
|
str r4, [r1, #4] ; [ o3 | o2]
|
||||||
|
smultt r3, r6, r12 ; [c1*2217 | ------ ]
|
||||||
|
str r5, [r1, #20] ; [ o11 | o10]
|
||||||
|
|
||||||
|
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
|
||||||
|
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
|
||||||
|
|
||||||
|
smulbb r2, r6, r12 ; [ ------ | c1*5352]
|
||||||
|
smultb r3, r6, r12 ; [c1*5352 | ------ ]
|
||||||
|
|
||||||
|
lsls r6, r7, #16 ; d1 != 0 ?
|
||||||
|
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
|
||||||
|
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
|
||||||
|
|
||||||
|
asrs r6, r7, #16
|
||||||
|
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
|
||||||
|
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
|
||||||
|
|
||||||
|
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
|
||||||
|
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
|
||||||
|
|
||||||
|
pkhtb r9, r9, r8, asr #16
|
||||||
|
|
||||||
|
sub r4, r4, r2
|
||||||
|
sub r5, r5, r3
|
||||||
|
|
||||||
|
str r9, [r1, #12] ; [o7 | o6]
|
||||||
|
pkhtb r5, r5, r4, asr #16 ; [o15|o14]
|
||||||
|
|
||||||
|
str r5, [r1, #28] ; [o15|o14]
|
||||||
|
|
||||||
|
ldmfd sp!, {r4 - r12, pc}
|
||||||
|
|
||||||
|
ENDP
|
||||||
|
|
||||||
|
; Used constants
|
||||||
|
c7500
|
||||||
|
DCD 7500
|
||||||
|
c14500
|
||||||
|
DCD 14500
|
||||||
|
c0x22a453a0
|
||||||
|
DCD 0x22a453a0
|
||||||
|
c0x00080008
|
||||||
|
DCD 0x00080008
|
||||||
|
c12000
|
||||||
|
DCD 12000
|
||||||
|
c51000
|
||||||
|
DCD 51000
|
||||||
|
c0x00070007
|
||||||
|
DCD 0x00070007
|
||||||
|
c0x08a914e8
|
||||||
|
DCD 0x08a914e8
|
||||||
|
|
||||||
|
END
|
24
vp8/encoder/arm/dct_arm.c
Normal file
24
vp8/encoder/arm/dct_arm.c
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "vpx_config.h"
|
||||||
|
#include "vp8/encoder/dct.h"
|
||||||
|
|
||||||
|
#if HAVE_ARMV6
|
||||||
|
|
||||||
|
/*
 * fdct8x4 entry point built on the ARMv6 4x4 routine: calls
 * vp8_fast_fdct4x4_armv6 twice with the same pitch, first on
 * (input, output) and then on (input + 4, output + 16), i.e. the
 * input advanced by 4 shorts and the output by 16 shorts.
 * NOTE(review): presumably these are the left and right 4x4 halves of
 * an 8x4 block -- confirm against the callers.
 */
void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
|
||||||
|
{
|
||||||
|
vp8_fast_fdct4x4_armv6(input, output, pitch);
|
||||||
|
vp8_fast_fdct4x4_armv6(input + 4, output + 16, pitch);
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif /* HAVE_ARMV6 */
|
||||||
|
|
||||||
|
|
@ -14,12 +14,21 @@
|
|||||||
|
|
||||||
#if HAVE_ARMV6
|
#if HAVE_ARMV6
|
||||||
extern prototype_fdct(vp8_short_walsh4x4_armv6);
|
extern prototype_fdct(vp8_short_walsh4x4_armv6);
|
||||||
|
extern prototype_fdct(vp8_fast_fdct4x4_armv6);
|
||||||
|
extern prototype_fdct(vp8_fast_fdct8x4_armv6);
|
||||||
|
|
||||||
#if !CONFIG_RUNTIME_CPU_DETECT
|
#if !CONFIG_RUNTIME_CPU_DETECT
|
||||||
#undef vp8_fdct_walsh_short4x4
|
#undef vp8_fdct_walsh_short4x4
|
||||||
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
|
#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
|
||||||
|
|
||||||
|
#undef vp8_fdct_fast4x4
|
||||||
|
#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
|
||||||
|
|
||||||
|
#undef vp8_fdct_fast8x4
|
||||||
|
#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
|
||||||
#endif
|
#endif
|
||||||
#endif
|
|
||||||
|
#endif /* HAVE_ARMV6 */
|
||||||
|
|
||||||
#if HAVE_ARMV7
|
#if HAVE_ARMV7
|
||||||
extern prototype_fdct(vp8_short_fdct4x4_neon);
|
extern prototype_fdct(vp8_short_fdct4x4_neon);
|
||||||
|
@ -19,6 +19,7 @@ VP8_CX_SRCS-$(ARCH_ARM) += encoder/asm_enc_offsets.c
|
|||||||
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c
|
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/encodemb_arm.c
|
||||||
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
|
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/quantize_arm.c
|
||||||
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
|
VP8_CX_SRCS-$(HAVE_ARMV7) += encoder/arm/picklpf_arm.c
|
||||||
|
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/dct_arm.c
|
||||||
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c
|
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.c
|
||||||
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.h
|
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/variance_arm.h
|
||||||
VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
|
VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
|
||||||
@ -34,6 +35,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/armv5te/vp8_packtokens_partitions_ar
|
|||||||
|
|
||||||
#File list for armv6
|
#File list for armv6
|
||||||
# encoder
|
# encoder
|
||||||
|
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
|
||||||
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
|
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
|
||||||
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
|
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
|
||||||
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
|
VP8_CX_SRCS-$(HAVE_ARMV6) += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)
|
||||||
|
Loading…
Reference in New Issue
Block a user