Merge remote branch 'internal/upstream' into HEAD

2011-03-24 00:05:05 -04:00
parent bf12748be3 4cde2ab765
commit 1f1526f8b8
5 changed files with 301 additions and 4 deletions
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -59,9 +59,9 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/
        /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
-        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;*/
-        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
+        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_armv6;
-        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;*/
+        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_armv6;
        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;
        /*cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
--- a/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_fast_fdct4x4_armv6.asm
@@ -0,0 +1,262 @@
 ;
 ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 ;
 ;  Use of this source code is governed by a BSD-style license
 ;  that can be found in the LICENSE file in the root of the source
 ;  tree. An additional intellectual property rights grant can be found
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
    EXPORT |vp8_fast_fdct4x4_armv6|
    ARM
    REQUIRE8
    PRESERVE8
    AREA    |.text|, CODE, READONLY
 ; void vp8_fast_fdct4x4_armv6(short *input, short *output, int pitch)
 ;
 ; Forward 4x4 DCT done in two passes over packed 16-bit halfword pairs.
 ;   r0 = input, r1 = output, r2 = pitch (assuming the standard ARM
 ;   procedure-call register assignment for the prototype above).
 ;   pitch is added directly to the input pointer, so it is used as the
 ;   byte distance between consecutive input rows.
 ; PART 1 transforms each of the four input rows; the [o3|o2]-style halves
 ; are stored to the output buffer, the [o1|o0]-style halves stay in
 ; registers (r3, r9, r2, r0).
 ; PART 2 combines the PART 1 results across rows and writes all 16 outputs.
 ; r4-r12 and lr are saved on entry and restored on exit.
 |vp8_fast_fdct4x4_armv6| PROC
    stmfd       sp!, {r4 - r12, lr}
    ; PART 1
    ; coeffs 0-3
    ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2]
    ldr         r10, c7500
    ldr         r11, c14500
    ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4]
    ldr         lr, c0x00080008
    ror         r5, r5, #16         ; [i2 | i3]
    qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift
    qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift
    add         r0, r0, r2          ; update input pointer
    qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd
                                    ; with 2217*4 and 5352*4 without losing the
                                    ; sign bit (overflow)
    smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8
    smusd       r5, r6, lr          ; o2 = (i1+i2)*8 - (i0+i3)*8
    smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500)
    smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500)
    ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6]
    pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2
    pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2]
    str         r6, [r1, #4]
    ; coeffs 4-7
    ror         r9, r9, #16         ; [i6 | i7]
    qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift
    qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift
    add         r0, r0, r2          ; update input pointer
    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
                                    ; with 2217*4 and 5352*4 without losing the
                                    ; sign bit (overflow)
    smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8
    smusd       r8, r6, lr          ; o6 = (i5+i6)*8 - (i4+i7)*8
    smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500)
    smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500)
    ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10]
    pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2
    pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6]
    str         r6, [r1, #12]
    ; coeffs 8-11
    ror         r5, r5, #16         ; [i10 | i11]
    qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift
    qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift
    add         r0, r0, r2          ; update input pointer
    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
                                    ; with 2217*4 and 5352*4 without losing the
                                    ; sign bit (overflow)
    smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8
    smusd       r8, r6, lr          ; o10 = (i9+i10)*8 - (i8+i11)*8
    smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500)
    smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500)
    ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14]
    pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2
    pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10]
    str         r6, [r1, #20]
    ; coeffs 12-15
    ror         r5, r5, #16         ; [i14 | i15]
    qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift
    qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift
    qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd
                                    ; with 2217*4 and 5352*4 without losing the
                                    ; sign bit (overflow)
    smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8
    smusd       r5, r6, lr          ; o14 = (i13+i14)*8 - (i12+i15)*8
    smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500)
    smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500)
    pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2
    pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14]
    str         r6, [r1, #28]
    ; PART 2 -------------------------------------------------
    ; From here on the "iN" names in the comments denote the PART 1 results
    ; (held in r3, r9, r2, r0 or reloaded from the output buffer), not the
    ; original input samples.
    ldr         r11, c12000
    ldr         r10, c51000
    ldr         lr, c0x00070007
    qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12]
    qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8]
    qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8]
    qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12]
    qadd16      r4, r4, lr          ; a1 + 7
    add         r0, r11, #0x10000   ; 12000 + (1 << 16): rounding constant that
                                    ; also adds 1 after the final asr #16,
                                    ; used when d1 != 0
    qadd16      r2, r4, r5          ; a1 + b1 + 7
    qsub16      r3, r4, r5          ; a1 - b1 + 7
    ldr         r12, c0x08a914e8    ; [2217 | 5352]
    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
    asr         r2, r2, #4          ; scale top halfword
    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
    asr         r3, r3, #4          ; scale top halfword
    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
    str         r4, [r1, #0]        ; [     o1 |      o0]
    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
    str         r5, [r1, #16]       ; [     o9 |      o8]
    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
    lsls        r6, r7, #16         ; bottom halfword of d1 != 0 ?
    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
    asrs        r6, r7, #16         ; top halfword of d1 != 0 ?
    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
    pkhtb       r9, r9, r8, asr #16 ; [o5 | o4]
    sub         r4, r4, r2
    sub         r5, r5, r3
    ldr         r3, [r1, #4]        ; [i3 | i2]
    pkhtb       r5, r5, r4, asr #16 ; [o13|o12]
    str         r9, [r1, #8]        ; [o5 | o4]
    ldr         r9, [r1, #12]       ; [i7 | i6]
    ldr         r8, [r1, #28]       ; [i15|i14]
    ldr         r2, [r1, #20]       ; [i11|i10]
    str         r5, [r1, #24]       ; [o13|o12]
    qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14]
    qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10]
    qadd16      r4, r4, lr          ; a1 + 7
    qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10]
    qadd16      r2, r4, r5          ; a1 + b1 + 7
    qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14]
    qsub16      r3, r4, r5          ; a1 - b1 + 7
    lsl         r8, r2, #16         ; prepare bottom halfword for scaling
    asr         r2, r2, #4          ; scale top halfword
    lsl         r9, r3, #16         ; prepare bottom halfword for scaling
    asr         r3, r3, #4          ; scale top halfword
    pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword
    pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword
    smulbt      r2, r6, r12         ; [ ------ | c1*2217]
    str         r4, [r1, #4]        ; [     o3 |      o2]
    smultt      r3, r6, r12         ; [c1*2217 | ------ ]
    str         r5, [r1, #20]       ; [    o11 |     o10]
    smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352]
    smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ]
    smulbb      r2, r6, r12         ; [ ------ | c1*5352]
    smultb      r3, r6, r12         ; [c1*5352 | ------ ]
    lsls        r6, r7, #16         ; bottom halfword of d1 != 0 ?
    addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0)
    addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0)
    asrs        r6, r7, #16         ; top halfword of d1 != 0 ?
    addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0)
    addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0)
    smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000
    smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000
    pkhtb       r9, r9, r8, asr #16 ; [o7 | o6]
    sub         r4, r4, r2
    sub         r5, r5, r3
    str         r9, [r1, #12]       ; [o7 | o6]
    pkhtb       r5, r5, r4, asr #16 ; [o15|o14]
    str         r5, [r1, #28]       ; [o15|o14]
    ldmfd       sp!, {r4 - r12, pc} ; restore saved registers and return
    ENDP
 ; Used constants
 c7500
    DCD     7500
 c14500
    DCD     14500
 c0x22a453a0
    DCD     0x22a453a0
 c0x00080008
    DCD     0x00080008
 c12000
    DCD     12000
 c51000
    DCD     51000
 c0x00070007
    DCD     0x00070007
 c0x08a914e8
    DCD     0x08a914e8
    END
--- a/vp8/encoder/arm/dct_arm.c
+++ b/vp8/encoder/arm/dct_arm.c
@@ -0,0 +1,24 @@
 /*
 *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
 #include "vpx_config.h"
 #include "vp8/encoder/dct.h"
 #if HAVE_ARMV6
 /* 8x4 forward DCT built from two calls to the ARMv6 4x4 routine.
  * The first call uses input/output as given; the second call is offset
  * by 4 shorts in the input and 16 shorts in the output.  The same pitch
  * is passed to both calls.
  */
 void vp8_fast_fdct8x4_armv6(short *input, short *output, int pitch)
 {
    int half;

    for (half = 0; half < 2; half++)
    {
        vp8_fast_fdct4x4_armv6(input + 4 * half, output + 16 * half, pitch);
    }
 }
 #endif /* HAVE_ARMV6 */
--- a/vp8/encoder/arm/dct_arm.h
+++ b/vp8/encoder/arm/dct_arm.h
@@ -14,12 +14,21 @@
 #if HAVE_ARMV6
 extern prototype_fdct(vp8_short_walsh4x4_armv6);
 extern prototype_fdct(vp8_fast_fdct4x4_armv6);
 extern prototype_fdct(vp8_fast_fdct8x4_armv6);
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_fdct_walsh_short4x4
 #define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
 #undef  vp8_fdct_fast4x4
 #define vp8_fdct_fast4x4 vp8_fast_fdct4x4_armv6
 #undef  vp8_fdct_fast8x4
 #define vp8_fdct_fast8x4 vp8_fast_fdct8x4_armv6
 #endif
-#endif
+
 #endif /* HAVE_ARMV6 */
 #if HAVE_ARMV7
 extern prototype_fdct(vp8_short_fdct4x4_neon);
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -19,6 +19,7 @@ VP8_CX_SRCS-$(ARCH_ARM)  += encoder/asm_enc_offsets.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/encodemb_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/dct_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/variance_arm.h
 VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
@@ -34,6 +35,7 @@ VP8_CX_SRCS-$(HAVE_ARMV5TE)  += encoder/arm/armv5te/vp8_packtokens_partitions_ar
 #File list for armv6
 # encoder
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_fdct4x4_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_fast_quantize_b_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_sad16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_ARMV6)  += encoder/arm/armv6/vp8_variance16x16_armv6$(ASM)