Remove armv6 target
Change-Id: I1fa81cc9cabf362a185fc3a53f1e58de533a41e5
This commit is contained in:
		
							
								
								
									
										3
									
								
								README
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								README
									
									
									
									
									
								
							| @@ -49,9 +49,6 @@ COMPILING THE APPLICATIONS/LIBRARIES: | |||||||
|  |  | ||||||
|     arm64-darwin-gcc |     arm64-darwin-gcc | ||||||
|     arm64-linux-gcc |     arm64-linux-gcc | ||||||
|     armv6-linux-rvct |  | ||||||
|     armv6-linux-gcc |  | ||||||
|     armv6-none-rvct |  | ||||||
|     armv7-android-gcc |     armv7-android-gcc | ||||||
|     armv7-darwin-gcc |     armv7-darwin-gcc | ||||||
|     armv7-linux-rvct |     armv7-linux-rvct | ||||||
|   | |||||||
| @@ -29,11 +29,6 @@ | |||||||
| # include $(CLEAR_VARS) | # include $(CLEAR_VARS) | ||||||
| # include jni/libvpx/build/make/Android.mk | # include jni/libvpx/build/make/Android.mk | ||||||
| # | # | ||||||
| # There are currently two TARGET_ARCH_ABI targets for ARM. |  | ||||||
| # armeabi and armeabi-v7a.  armeabi-v7a is selected by creating an |  | ||||||
| # Application.mk in the jni directory that contains: |  | ||||||
| # APP_ABI := armeabi-v7a |  | ||||||
| # |  | ||||||
| # By default libvpx will detect at runtime the existence of NEON extension. | # By default libvpx will detect at runtime the existence of NEON extension. | ||||||
| # For this we import the 'cpufeatures' module from the NDK sources. | # For this we import the 'cpufeatures' module from the NDK sources. | ||||||
| # libvpx can also be configured without this runtime detection method. | # libvpx can also be configured without this runtime detection method. | ||||||
| @@ -42,9 +37,6 @@ | |||||||
| #     --disable-neon-asm | #     --disable-neon-asm | ||||||
| # will remove any NEON dependency. | # will remove any NEON dependency. | ||||||
|  |  | ||||||
| # To change to building armeabi, run ./libvpx/configure again, but with |  | ||||||
| # --target=armv6-android-gcc and modify the Application.mk file to |  | ||||||
| # set APP_ABI := armeabi |  | ||||||
| # | # | ||||||
| # Running ndk-build will build libvpx and include it in your project. | # Running ndk-build will build libvpx and include it in your project. | ||||||
| # | # | ||||||
| @@ -59,9 +51,6 @@ ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL) | |||||||
| ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) | ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) | ||||||
|   include $(CONFIG_DIR)libs-armv7-android-gcc.mk |   include $(CONFIG_DIR)libs-armv7-android-gcc.mk | ||||||
|   LOCAL_ARM_MODE := arm |   LOCAL_ARM_MODE := arm | ||||||
| else ifeq  ($(TARGET_ARCH_ABI),armeabi) |  | ||||||
|   include $(CONFIG_DIR)libs-armv6-android-gcc.mk |  | ||||||
|   LOCAL_ARM_MODE := arm |  | ||||||
| else ifeq  ($(TARGET_ARCH_ABI),arm64-v8a) | else ifeq  ($(TARGET_ARCH_ABI),arm64-v8a) | ||||||
|   include $(CONFIG_DIR)libs-armv8-android-gcc.mk |   include $(CONFIG_DIR)libs-armv8-android-gcc.mk | ||||||
|   LOCAL_ARM_MODE := arm |   LOCAL_ARM_MODE := arm | ||||||
|   | |||||||
| @@ -680,9 +680,6 @@ process_common_toolchain() { | |||||||
|       aarch64*) |       aarch64*) | ||||||
|         tgt_isa=arm64 |         tgt_isa=arm64 | ||||||
|         ;; |         ;; | ||||||
|       armv6*) |  | ||||||
|         tgt_isa=armv6 |  | ||||||
|         ;; |  | ||||||
|       armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf) |       armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf) | ||||||
|         tgt_isa=armv7 |         tgt_isa=armv7 | ||||||
|         float_abi=hard |         float_abi=hard | ||||||
| @@ -883,36 +880,6 @@ process_common_toolchain() { | |||||||
|           if disabled neon && enabled neon_asm; then |           if disabled neon && enabled neon_asm; then | ||||||
|             die "Disabling neon while keeping neon-asm is not supported" |             die "Disabling neon while keeping neon-asm is not supported" | ||||||
|           fi |           fi | ||||||
|           case ${toolchain} in |  | ||||||
|             # Apple iOS SDKs no longer support armv6 as of the version 9 |  | ||||||
|             # release (coincides with release of Xcode 7). Only enable media |  | ||||||
|             # when using earlier SDK releases. |  | ||||||
|             *-darwin*) |  | ||||||
|               if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then |  | ||||||
|                 soft_enable media |  | ||||||
|               else |  | ||||||
|                 soft_disable media |  | ||||||
|                 RTCD_OPTIONS="${RTCD_OPTIONS}--disable-media " |  | ||||||
|               fi |  | ||||||
|               ;; |  | ||||||
|             *) |  | ||||||
|               soft_enable media |  | ||||||
|               ;; |  | ||||||
|           esac |  | ||||||
|           ;; |  | ||||||
|         armv6) |  | ||||||
|           case ${toolchain} in |  | ||||||
|             *-darwin*) |  | ||||||
|               if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then |  | ||||||
|                 soft_enable media |  | ||||||
|               else |  | ||||||
|                 die "Your iOS SDK does not support armv6." |  | ||||||
|               fi |  | ||||||
|               ;; |  | ||||||
|             *) |  | ||||||
|               soft_enable media |  | ||||||
|               ;; |  | ||||||
|           esac |  | ||||||
|           ;; |           ;; | ||||||
|       esac |       esac | ||||||
|  |  | ||||||
|   | |||||||
| @@ -384,13 +384,8 @@ if ($opts{arch} eq 'x86') { | |||||||
|   } |   } | ||||||
|   close CONFIG_FILE; |   close CONFIG_FILE; | ||||||
|   mips; |   mips; | ||||||
| } elsif ($opts{arch} eq 'armv6') { |  | ||||||
|   @ALL_ARCHS = filter(qw/media/); |  | ||||||
|   arm; |  | ||||||
| } elsif ($opts{arch} =~ /armv7\w?/) { | } elsif ($opts{arch} =~ /armv7\w?/) { | ||||||
|   @ALL_ARCHS = filter(qw/media neon_asm neon/); |   @ALL_ARCHS = filter(qw/neon_asm neon/); | ||||||
|   @REQUIRES = filter(keys %required ? keys %required : qw/media/); |  | ||||||
|   &require(@REQUIRES); |  | ||||||
|   arm; |   arm; | ||||||
| } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { | } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { | ||||||
|   @ALL_ARCHS = filter(qw/neon/); |   @ALL_ARCHS = filter(qw/neon/); | ||||||
|   | |||||||
							
								
								
									
										5
									
								
								configure
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										5
									
								
								configure
									
									
									
									
										vendored
									
									
								
							| @@ -99,9 +99,6 @@ EOF | |||||||
| # alphabetically by architecture, generic-gnu last. | # alphabetically by architecture, generic-gnu last. | ||||||
| all_platforms="${all_platforms} arm64-darwin-gcc" | all_platforms="${all_platforms} arm64-darwin-gcc" | ||||||
| all_platforms="${all_platforms} arm64-linux-gcc" | all_platforms="${all_platforms} arm64-linux-gcc" | ||||||
| all_platforms="${all_platforms} armv6-linux-rvct" |  | ||||||
| all_platforms="${all_platforms} armv6-linux-gcc" |  | ||||||
| all_platforms="${all_platforms} armv6-none-rvct" |  | ||||||
| all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8 | all_platforms="${all_platforms} armv7-android-gcc"   #neon Cortex-A8 | ||||||
| all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8 | all_platforms="${all_platforms} armv7-darwin-gcc"    #neon Cortex-A8 | ||||||
| all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8 | all_platforms="${all_platforms} armv7-linux-rvct"    #neon Cortex-A8 | ||||||
| @@ -236,8 +233,6 @@ ARCH_EXT_LIST_X86=" | |||||||
|     avx2 |     avx2 | ||||||
| " | " | ||||||
| ARCH_EXT_LIST=" | ARCH_EXT_LIST=" | ||||||
|     edsp |  | ||||||
|     media |  | ||||||
|     neon |     neon | ||||||
|     neon_asm |     neon_asm | ||||||
|  |  | ||||||
|   | |||||||
| @@ -640,13 +640,6 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); | |||||||
|  |  | ||||||
| //------------------------------------------------------------------------------ | //------------------------------------------------------------------------------ | ||||||
| // ARM functions | // ARM functions | ||||||
| #if HAVE_MEDIA |  | ||||||
| const SadMxNParam media_tests[] = { |  | ||||||
|   SadMxNParam(16, 16, &vpx_sad16x16_media), |  | ||||||
| }; |  | ||||||
| INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::ValuesIn(media_tests)); |  | ||||||
| #endif  // HAVE_MEDIA |  | ||||||
|  |  | ||||||
| #if HAVE_NEON | #if HAVE_NEON | ||||||
| const SadMxNParam neon_tests[] = { | const SadMxNParam neon_tests[] = { | ||||||
|   SadMxNParam(64, 64, &vpx_sad64x64_neon), |   SadMxNParam(64, 64, &vpx_sad64x64_neon), | ||||||
|   | |||||||
| @@ -1205,22 +1205,6 @@ INSTANTIATE_TEST_CASE_P( | |||||||
|         make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0))); |         make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0))); | ||||||
| #endif  // HAVE_AVX2 | #endif  // HAVE_AVX2 | ||||||
|  |  | ||||||
| #if HAVE_MEDIA |  | ||||||
| INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest, |  | ||||||
|                         ::testing::Values(MseParams(4, 4, |  | ||||||
|                                                     &vpx_mse16x16_media))); |  | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P( |  | ||||||
|     MEDIA, VpxVarianceTest, |  | ||||||
|     ::testing::Values(VarianceParams(4, 4, &vpx_variance16x16_media), |  | ||||||
|                       VarianceParams(3, 3, &vpx_variance8x8_media))); |  | ||||||
|  |  | ||||||
| INSTANTIATE_TEST_CASE_P( |  | ||||||
|     MEDIA, VpxSubpelVarianceTest, |  | ||||||
|     ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_media, 0), |  | ||||||
|                       make_tuple(3, 3, &vpx_sub_pixel_variance8x8_media, 0))); |  | ||||||
| #endif  // HAVE_MEDIA |  | ||||||
|  |  | ||||||
| #if HAVE_NEON | #if HAVE_NEON | ||||||
| INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest, | INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest, | ||||||
|                         ::testing::Values(SseParams(2, 2, |                         ::testing::Values(SseParams(2, 2, | ||||||
|   | |||||||
| @@ -1,237 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_filter_block2d_bil_first_pass_armv6| |  | ||||||
|     EXPORT  |vp8_filter_block2d_bil_second_pass_armv6| |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY  ; name this block of code |  | ||||||
|  |  | ||||||
| ;------------------------------------- |  | ||||||
| ; r0    unsigned char  *src_ptr, |  | ||||||
| ; r1    unsigned short *dst_ptr, |  | ||||||
| ; r2    unsigned int    src_pitch, |  | ||||||
| ; r3    unsigned int    height, |  | ||||||
| ; stack unsigned int    width, |  | ||||||
| ; stack const short    *vp8_filter |  | ||||||
| ;------------------------------------- |  | ||||||
| ; The output is stored transposed in the output array to make second-pass filtering easy. |  | ||||||
| |vp8_filter_block2d_bil_first_pass_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ldr     r11, [sp, #40]                  ; vp8_filter address |  | ||||||
|     ldr     r4, [sp, #36]                   ; width |  | ||||||
|  |  | ||||||
|     mov     r12, r3                         ; outer-loop counter |  | ||||||
|  |  | ||||||
|     add     r7, r2, r4                      ; preload next row |  | ||||||
|     pld     [r0, r7] |  | ||||||
|  |  | ||||||
|     sub     r2, r2, r4                      ; src increment for height loop |  | ||||||
|  |  | ||||||
|     ldr     r5, [r11]                       ; load up filter coefficients |  | ||||||
|  |  | ||||||
|     mov     r3, r3, lsl #1                  ; height*2 |  | ||||||
|     add     r3, r3, #2                      ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1) |  | ||||||
|  |  | ||||||
|     mov     r11, r1                         ; save dst_ptr for each row |  | ||||||
|  |  | ||||||
|     cmp     r5, #128                        ; if filter coef = 128, then skip the filter |  | ||||||
|     beq     bil_null_1st_filter |  | ||||||
|  |  | ||||||
| |bil_height_loop_1st_v6| |  | ||||||
|     ldrb    r6, [r0]                        ; load source data |  | ||||||
|     ldrb    r7, [r0, #1] |  | ||||||
|     ldrb    r8, [r0, #2] |  | ||||||
|     mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter |  | ||||||
|  |  | ||||||
| |bil_width_loop_1st_v6| |  | ||||||
|     ldrb    r9, [r0, #3] |  | ||||||
|     ldrb    r10, [r0, #4] |  | ||||||
|  |  | ||||||
|     pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0] |  | ||||||
|     pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1] |  | ||||||
|  |  | ||||||
|     smuad   r6, r6, r5                      ; apply the filter |  | ||||||
|     pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2] |  | ||||||
|     smuad   r7, r7, r5 |  | ||||||
|     pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3] |  | ||||||
|  |  | ||||||
|     smuad   r8, r8, r5 |  | ||||||
|     smuad   r9, r9, r5 |  | ||||||
|  |  | ||||||
|     add     r0, r0, #4 |  | ||||||
|     subs    lr, lr, #1 |  | ||||||
|  |  | ||||||
|     add     r6, r6, #0x40                   ; round_shift_and_clamp |  | ||||||
|     add     r7, r7, #0x40 |  | ||||||
|     usat    r6, #16, r6, asr #7 |  | ||||||
|     usat    r7, #16, r7, asr #7 |  | ||||||
|  |  | ||||||
|     strh    r6, [r1], r3                    ; result is transposed and stored |  | ||||||
|  |  | ||||||
|     add     r8, r8, #0x40                   ; round_shift_and_clamp |  | ||||||
|     strh    r7, [r1], r3 |  | ||||||
|     add     r9, r9, #0x40 |  | ||||||
|     usat    r8, #16, r8, asr #7 |  | ||||||
|     usat    r9, #16, r9, asr #7 |  | ||||||
|  |  | ||||||
|     strh    r8, [r1], r3                    ; result is transposed and stored |  | ||||||
|  |  | ||||||
|     ldrneb  r6, [r0]                        ; load source data |  | ||||||
|     strh    r9, [r1], r3 |  | ||||||
|  |  | ||||||
|     ldrneb  r7, [r0, #1] |  | ||||||
|     ldrneb  r8, [r0, #2] |  | ||||||
|  |  | ||||||
|     bne     bil_width_loop_1st_v6 |  | ||||||
|  |  | ||||||
|     add     r0, r0, r2                      ; move to next input row |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     add     r9, r2, r4, lsl #1              ; adding back block width |  | ||||||
|     pld     [r0, r9]                        ; preload next row |  | ||||||
|  |  | ||||||
|     add     r11, r11, #2                    ; move over to next column |  | ||||||
|     mov     r1, r11 |  | ||||||
|  |  | ||||||
|     bne     bil_height_loop_1st_v6 |  | ||||||
|  |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
| |bil_null_1st_filter| |  | ||||||
| |bil_height_loop_null_1st| |  | ||||||
|     mov     lr, r4, lsr #2                  ; loop counter |  | ||||||
|  |  | ||||||
| |bil_width_loop_null_1st| |  | ||||||
|     ldrb    r6, [r0]                        ; load data |  | ||||||
|     ldrb    r7, [r0, #1] |  | ||||||
|     ldrb    r8, [r0, #2] |  | ||||||
|     ldrb    r9, [r0, #3] |  | ||||||
|  |  | ||||||
|     strh    r6, [r1], r3                    ; store it to intermediate buffer |  | ||||||
|     add     r0, r0, #4 |  | ||||||
|     strh    r7, [r1], r3 |  | ||||||
|     subs    lr, lr, #1 |  | ||||||
|     strh    r8, [r1], r3 |  | ||||||
|     strh    r9, [r1], r3 |  | ||||||
|  |  | ||||||
|     bne     bil_width_loop_null_1st |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, r2                      ; move to next input line |  | ||||||
|     add     r11, r11, #2                    ; move over to next column |  | ||||||
|     mov     r1, r11 |  | ||||||
|  |  | ||||||
|     bne     bil_height_loop_null_1st |  | ||||||
|  |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP  ; |vp8_filter_block2d_bil_first_pass_armv6| |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ;--------------------------------- |  | ||||||
| ; r0    unsigned short *src_ptr, |  | ||||||
| ; r1    unsigned char  *dst_ptr, |  | ||||||
| ; r2    int             dst_pitch, |  | ||||||
| ; r3    unsigned int    height, |  | ||||||
| ; stack unsigned int    width, |  | ||||||
| ; stack const short    *vp8_filter |  | ||||||
| ;--------------------------------- |  | ||||||
| |vp8_filter_block2d_bil_second_pass_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ldr     r11, [sp, #40]                  ; vp8_filter address |  | ||||||
|     ldr     r4, [sp, #36]                   ; width |  | ||||||
|  |  | ||||||
|     ldr     r5, [r11]                       ; load up filter coefficients |  | ||||||
|     mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix |  | ||||||
|     mov     r11, r1 |  | ||||||
|  |  | ||||||
|     cmp     r5, #128                        ; if filter coef = 128, then skip the filter |  | ||||||
|     beq     bil_null_2nd_filter |  | ||||||
|  |  | ||||||
| |bil_height_loop_2nd| |  | ||||||
|     ldr     r6, [r0]                        ; load the data |  | ||||||
|     ldr     r8, [r0, #4] |  | ||||||
|     ldrh    r10, [r0, #8] |  | ||||||
|     mov     lr, r3, lsr #2                  ; loop counter |  | ||||||
|  |  | ||||||
| |bil_width_loop_2nd| |  | ||||||
|     pkhtb   r7, r6, r8                      ; src[1] | src[2] |  | ||||||
|     pkhtb   r9, r8, r10                     ; src[3] | src[4] |  | ||||||
|  |  | ||||||
|     smuad   r6, r6, r5                      ; apply filter |  | ||||||
|     smuad   r8, r8, r5                      ; apply filter |  | ||||||
|  |  | ||||||
|     subs    lr, lr, #1 |  | ||||||
|  |  | ||||||
|     smuadx  r7, r7, r5                      ; apply filter |  | ||||||
|     smuadx  r9, r9, r5                      ; apply filter |  | ||||||
|  |  | ||||||
|     add     r0, r0, #8 |  | ||||||
|  |  | ||||||
|     add     r6, r6, #0x40                   ; round_shift_and_clamp |  | ||||||
|     add     r7, r7, #0x40 |  | ||||||
|     usat    r6, #8, r6, asr #7 |  | ||||||
|     usat    r7, #8, r7, asr #7 |  | ||||||
|     strb    r6, [r1], r2                    ; the result is transposed back and stored |  | ||||||
|  |  | ||||||
|     add     r8, r8, #0x40                   ; round_shift_and_clamp |  | ||||||
|     strb    r7, [r1], r2 |  | ||||||
|     add     r9, r9, #0x40 |  | ||||||
|     usat    r8, #8, r8, asr #7 |  | ||||||
|     usat    r9, #8, r9, asr #7 |  | ||||||
|     strb    r8, [r1], r2                    ; the result is transposed back and stored |  | ||||||
|  |  | ||||||
|     ldrne   r6, [r0]                        ; load data |  | ||||||
|     strb    r9, [r1], r2 |  | ||||||
|     ldrne   r8, [r0, #4] |  | ||||||
|     ldrneh  r10, [r0, #8] |  | ||||||
|  |  | ||||||
|     bne     bil_width_loop_2nd |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, #4                      ; update src for next row |  | ||||||
|     add     r11, r11, #1 |  | ||||||
|     mov     r1, r11 |  | ||||||
|  |  | ||||||
|     bne     bil_height_loop_2nd |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
| |bil_null_2nd_filter| |  | ||||||
| |bil_height_loop_null_2nd| |  | ||||||
|     mov     lr, r3, lsr #2 |  | ||||||
|  |  | ||||||
| |bil_width_loop_null_2nd| |  | ||||||
|     ldr     r6, [r0], #4                    ; load data |  | ||||||
|     subs    lr, lr, #1 |  | ||||||
|     ldr     r8, [r0], #4 |  | ||||||
|  |  | ||||||
|     strb    r6, [r1], r2                    ; store data |  | ||||||
|     mov     r7, r6, lsr #16 |  | ||||||
|     strb    r7, [r1], r2 |  | ||||||
|     mov     r9, r8, lsr #16 |  | ||||||
|     strb    r8, [r1], r2 |  | ||||||
|     strb    r9, [r1], r2 |  | ||||||
|  |  | ||||||
|     bne     bil_width_loop_null_2nd |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, #4 |  | ||||||
|     add     r11, r11, #1 |  | ||||||
|     mov     r1, r11 |  | ||||||
|  |  | ||||||
|     bne     bil_height_loop_null_2nd |  | ||||||
|  |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|     ENDP  ; |vp8_filter_block2d_bil_second_pass_armv6| |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,186 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_copy_mem16x16_v6| |  | ||||||
|     ; ARM |  | ||||||
|     ; REQUIRE8 |  | ||||||
|     ; PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA    Block, CODE, READONLY ; name this block of code |  | ||||||
| ;void vp8_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) |  | ||||||
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= |  | ||||||
| |vp8_copy_mem16x16_v6| PROC |  | ||||||
|     stmdb       sp!, {r4 - r7} |  | ||||||
|     ;push   {r4-r7} |  | ||||||
|  |  | ||||||
|     ;preload |  | ||||||
|     pld     [r0, #31]                ; preload for next 16x16 block |  | ||||||
|  |  | ||||||
|     ands    r4, r0, #15 |  | ||||||
|     beq     copy_mem16x16_fast |  | ||||||
|  |  | ||||||
|     ands    r4, r0, #7 |  | ||||||
|     beq     copy_mem16x16_8 |  | ||||||
|  |  | ||||||
|     ands    r4, r0, #3 |  | ||||||
|     beq     copy_mem16x16_4 |  | ||||||
|  |  | ||||||
|     ;copy one byte each time |  | ||||||
|     ldrb    r4, [r0] |  | ||||||
|     ldrb    r5, [r0, #1] |  | ||||||
|     ldrb    r6, [r0, #2] |  | ||||||
|     ldrb    r7, [r0, #3] |  | ||||||
|  |  | ||||||
|     mov     r12, #16 |  | ||||||
|  |  | ||||||
| copy_mem16x16_1_loop |  | ||||||
|     strb    r4, [r2] |  | ||||||
|     strb    r5, [r2, #1] |  | ||||||
|     strb    r6, [r2, #2] |  | ||||||
|     strb    r7, [r2, #3] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #4] |  | ||||||
|     ldrb    r5, [r0, #5] |  | ||||||
|     ldrb    r6, [r0, #6] |  | ||||||
|     ldrb    r7, [r0, #7] |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #4] |  | ||||||
|     strb    r5, [r2, #5] |  | ||||||
|     strb    r6, [r2, #6] |  | ||||||
|     strb    r7, [r2, #7] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #8] |  | ||||||
|     ldrb    r5, [r0, #9] |  | ||||||
|     ldrb    r6, [r0, #10] |  | ||||||
|     ldrb    r7, [r0, #11] |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #8] |  | ||||||
|     strb    r5, [r2, #9] |  | ||||||
|     strb    r6, [r2, #10] |  | ||||||
|     strb    r7, [r2, #11] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #12] |  | ||||||
|     ldrb    r5, [r0, #13] |  | ||||||
|     ldrb    r6, [r0, #14] |  | ||||||
|     ldrb    r7, [r0, #15] |  | ||||||
|  |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #12] |  | ||||||
|     strb    r5, [r2, #13] |  | ||||||
|     strb    r6, [r2, #14] |  | ||||||
|     strb    r7, [r2, #15] |  | ||||||
|  |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     ldrneb  r4, [r0] |  | ||||||
|     ldrneb  r5, [r0, #1] |  | ||||||
|     ldrneb  r6, [r0, #2] |  | ||||||
|     ldrneb  r7, [r0, #3] |  | ||||||
|  |  | ||||||
|     pld     [r0, #31]               ; preload for next 16x16 block |  | ||||||
|  |  | ||||||
|     bne     copy_mem16x16_1_loop |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r7} |  | ||||||
|     ;pop        {r4-r7} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
| ;copy 4 bytes each time |  | ||||||
| copy_mem16x16_4 |  | ||||||
|     ldr     r4, [r0] |  | ||||||
|     ldr     r5, [r0, #4] |  | ||||||
|     ldr     r6, [r0, #8] |  | ||||||
|     ldr     r7, [r0, #12] |  | ||||||
|  |  | ||||||
|     mov     r12, #16 |  | ||||||
|  |  | ||||||
| copy_mem16x16_4_loop |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     str     r4, [r2] |  | ||||||
|     str     r5, [r2, #4] |  | ||||||
|     str     r6, [r2, #8] |  | ||||||
|     str     r7, [r2, #12] |  | ||||||
|  |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     ldrne   r4, [r0] |  | ||||||
|     ldrne   r5, [r0, #4] |  | ||||||
|     ldrne   r6, [r0, #8] |  | ||||||
|     ldrne   r7, [r0, #12] |  | ||||||
|  |  | ||||||
|     pld     [r0, #31]               ; preload for next 16x16 block |  | ||||||
|  |  | ||||||
|     bne     copy_mem16x16_4_loop |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r7} |  | ||||||
|     ;pop        {r4-r7} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
| ;copy 8 bytes each time |  | ||||||
| copy_mem16x16_8 |  | ||||||
|     sub     r1, r1, #16 |  | ||||||
|     sub     r3, r3, #16 |  | ||||||
|  |  | ||||||
|     mov     r12, #16 |  | ||||||
|  |  | ||||||
| copy_mem16x16_8_loop |  | ||||||
|     ldmia   r0!, {r4-r5} |  | ||||||
|     ;ldm        r0, {r4-r5} |  | ||||||
|     ldmia   r0!, {r6-r7} |  | ||||||
|  |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     stmia   r2!, {r4-r5} |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     ;stm        r2, {r4-r5} |  | ||||||
|     stmia   r2!, {r6-r7} |  | ||||||
|  |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     pld     [r0, #31]               ; preload for next 16x16 block |  | ||||||
|     bne     copy_mem16x16_8_loop |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r7} |  | ||||||
|     ;pop        {r4-r7} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
| ;copy 16 bytes each time |  | ||||||
| copy_mem16x16_fast |  | ||||||
|     ;sub        r1, r1, #16 |  | ||||||
|     ;sub        r3, r3, #16 |  | ||||||
|  |  | ||||||
|     mov     r12, #16 |  | ||||||
|  |  | ||||||
| copy_mem16x16_fast_loop |  | ||||||
|     ldmia   r0, {r4-r7} |  | ||||||
|     ;ldm        r0, {r4-r7} |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     stmia   r2, {r4-r7} |  | ||||||
|     ;stm        r2, {r4-r7} |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     pld     [r0, #31]               ; preload for next 16x16 block |  | ||||||
|     bne     copy_mem16x16_fast_loop |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r7} |  | ||||||
|     ;pop        {r4-r7} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
|     ENDP  ; |vp8_copy_mem16x16_v6| |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,128 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_copy_mem8x4_v6| |  | ||||||
|     ; ARM |  | ||||||
|     ; REQUIRE8 |  | ||||||
|     ; PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA    Block, CODE, READONLY ; name this block of code |  | ||||||
| ;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) |  | ||||||
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= |  | ||||||
| |vp8_copy_mem8x4_v6| PROC |  | ||||||
|     ;push   {r4-r5} |  | ||||||
|     stmdb  sp!, {r4-r5} |  | ||||||
|  |  | ||||||
|     ;preload |  | ||||||
|     pld     [r0] |  | ||||||
|     pld     [r0, r1] |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|  |  | ||||||
|     ands    r4, r0, #7 |  | ||||||
|     beq     copy_mem8x4_fast |  | ||||||
|  |  | ||||||
|     ands    r4, r0, #3 |  | ||||||
|     beq     copy_mem8x4_4 |  | ||||||
|  |  | ||||||
|     ;copy 1 byte each time |  | ||||||
|     ldrb    r4, [r0] |  | ||||||
|     ldrb    r5, [r0, #1] |  | ||||||
|  |  | ||||||
|     mov     r12, #4 |  | ||||||
|  |  | ||||||
| copy_mem8x4_1_loop |  | ||||||
|     strb    r4, [r2] |  | ||||||
|     strb    r5, [r2, #1] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #2] |  | ||||||
|     ldrb    r5, [r0, #3] |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #2] |  | ||||||
|     strb    r5, [r2, #3] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #4] |  | ||||||
|     ldrb    r5, [r0, #5] |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #4] |  | ||||||
|     strb    r5, [r2, #5] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #6] |  | ||||||
|     ldrb    r5, [r0, #7] |  | ||||||
|  |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #6] |  | ||||||
|     strb    r5, [r2, #7] |  | ||||||
|  |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     ldrneb  r4, [r0] |  | ||||||
|     ldrneb  r5, [r0, #1] |  | ||||||
|  |  | ||||||
|     bne     copy_mem8x4_1_loop |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r5} |  | ||||||
|     ;pop        {r4-r5} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
| ;copy 4 bytes each time |  | ||||||
| copy_mem8x4_4 |  | ||||||
|     ldr     r4, [r0] |  | ||||||
|     ldr     r5, [r0, #4] |  | ||||||
|  |  | ||||||
|     mov     r12, #4 |  | ||||||
|  |  | ||||||
| copy_mem8x4_4_loop |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     str     r4, [r2] |  | ||||||
|     str     r5, [r2, #4] |  | ||||||
|  |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     ldrne   r4, [r0] |  | ||||||
|     ldrne   r5, [r0, #4] |  | ||||||
|  |  | ||||||
|     bne     copy_mem8x4_4_loop |  | ||||||
|  |  | ||||||
|     ldmia  sp!, {r4-r5} |  | ||||||
|     ;pop        {r4-r5} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
| ;copy 8 bytes each time |  | ||||||
| copy_mem8x4_fast |  | ||||||
|     ;sub        r1, r1, #8 |  | ||||||
|     ;sub        r3, r3, #8 |  | ||||||
|  |  | ||||||
|     mov     r12, #4 |  | ||||||
|  |  | ||||||
| copy_mem8x4_fast_loop |  | ||||||
|     ldmia   r0, {r4-r5} |  | ||||||
|     ;ldm        r0, {r4-r5} |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     stmia   r2, {r4-r5} |  | ||||||
|     ;stm        r2, {r4-r5} |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     bne     copy_mem8x4_fast_loop |  | ||||||
|  |  | ||||||
|     ldmia  sp!, {r4-r5} |  | ||||||
|     ;pop        {r4-r5} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
|     ENDP  ; |vp8_copy_mem8x4_v6| |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,128 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_copy_mem8x8_v6| |  | ||||||
|     ; ARM |  | ||||||
|     ; REQUIRE8 |  | ||||||
|     ; PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA    Block, CODE, READONLY ; name this block of code |  | ||||||
| ;void vp8_copy_mem8x8_v6(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) |  | ||||||
| ; |  | ||||||
| ; Copies 8 rows of 8 bytes from src to dst, advancing src by src_stride and |  | ||||||
| ; dst by dst_stride after every row. |  | ||||||
| ; r0  src |  | ||||||
| ; r1  src_stride |  | ||||||
| ; r2  dst |  | ||||||
| ; r3  dst_stride |  | ||||||
| ; The access width is chosen from the alignment of src only: multi-word |  | ||||||
| ; (ldmia/stmia) when src is 8-byte aligned, word loads/stores when it is |  | ||||||
| ; 4-byte aligned, byte loads/stores otherwise. |  | ||||||
| ; NOTE(review): dst alignment is never tested; the word and multi-word stores |  | ||||||
| ; presumably rely on dst being suitably aligned - confirm against callers. |  | ||||||
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= |  | ||||||
| |vp8_copy_mem8x8_v6| PROC |  | ||||||
|     ;push   {r4-r5} |  | ||||||
|     stmdb  sp!, {r4-r5} |  | ||||||
|  |  | ||||||
|     ;preload the first three source rows |  | ||||||
|     pld     [r0] |  | ||||||
|     pld     [r0, r1] |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|  |  | ||||||
|     ;src 8-byte aligned: take the multi-word path |  | ||||||
|     ands    r4, r0, #7 |  | ||||||
|     beq     copy_mem8x8_fast |  | ||||||
|  |  | ||||||
|     ;src 4-byte aligned: take the word path |  | ||||||
|     ands    r4, r0, #3 |  | ||||||
|     beq     copy_mem8x8_4 |  | ||||||
|  |  | ||||||
|     ;copy 1 byte each time |  | ||||||
|     ;the first two bytes of a row are loaded ahead of the loop body |  | ||||||
|     ldrb    r4, [r0] |  | ||||||
|     ldrb    r5, [r0, #1] |  | ||||||
|  |  | ||||||
|     ;r12 = number of rows left |  | ||||||
|     mov     r12, #8 |  | ||||||
|  |  | ||||||
| copy_mem8x8_1_loop |  | ||||||
|     strb    r4, [r2] |  | ||||||
|     strb    r5, [r2, #1] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #2] |  | ||||||
|     ldrb    r5, [r0, #3] |  | ||||||
|  |  | ||||||
|     ;flags set here are consumed by the ldrneb/bne at the bottom of the loop; |  | ||||||
|     ;no instruction in between updates them |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #2] |  | ||||||
|     strb    r5, [r2, #3] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #4] |  | ||||||
|     ldrb    r5, [r0, #5] |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #4] |  | ||||||
|     strb    r5, [r2, #5] |  | ||||||
|  |  | ||||||
|     ldrb    r4, [r0, #6] |  | ||||||
|     ldrb    r5, [r0, #7] |  | ||||||
|  |  | ||||||
|     ;advance src to the next row |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     strb    r4, [r2, #6] |  | ||||||
|     strb    r5, [r2, #7] |  | ||||||
|  |  | ||||||
|     ;advance dst to the next row |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     ;load the first two bytes of the next row only if a row remains |  | ||||||
|     ldrneb  r4, [r0] |  | ||||||
|     ldrneb  r5, [r0, #1] |  | ||||||
|  |  | ||||||
|     bne     copy_mem8x8_1_loop |  | ||||||
|  |  | ||||||
|     ;restore r4-r5 and return |  | ||||||
|     ldmia       sp!, {r4 - r5} |  | ||||||
|     ;pop        {r4-r5} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
| ;copy 4 bytes each time |  | ||||||
| copy_mem8x8_4 |  | ||||||
|     ;the first row is loaded ahead of the loop body |  | ||||||
|     ldr     r4, [r0] |  | ||||||
|     ldr     r5, [r0, #4] |  | ||||||
|  |  | ||||||
|     ;r12 = number of rows left |  | ||||||
|     mov     r12, #8 |  | ||||||
|  |  | ||||||
| copy_mem8x8_4_loop |  | ||||||
|     ;flags set here are consumed by the ldrne/bne below |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     str     r4, [r2] |  | ||||||
|     str     r5, [r2, #4] |  | ||||||
|  |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     ;load the next row only if a row remains |  | ||||||
|     ldrne   r4, [r0] |  | ||||||
|     ldrne   r5, [r0, #4] |  | ||||||
|  |  | ||||||
|     bne     copy_mem8x8_4_loop |  | ||||||
|  |  | ||||||
|     ;restore r4-r5 and return |  | ||||||
|     ldmia       sp!, {r4 - r5} |  | ||||||
|     ;pop        {r4-r5} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
| ;copy 8 bytes each time |  | ||||||
| copy_mem8x8_fast |  | ||||||
|     ;sub        r1, r1, #8 |  | ||||||
|     ;sub        r3, r3, #8 |  | ||||||
|  |  | ||||||
|     ;r12 = number of rows left |  | ||||||
|     mov     r12, #8 |  | ||||||
|  |  | ||||||
| copy_mem8x8_fast_loop |  | ||||||
|     ;load one 8-byte row without writing back r0, then step r0 by src_stride |  | ||||||
|     ldmia   r0, {r4-r5} |  | ||||||
|     ;ldm        r0, {r4-r5} |  | ||||||
|     add     r0, r0, r1 |  | ||||||
|  |  | ||||||
|     ;flags set here are consumed by the bne below |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     ;store the row without writing back r2, then step r2 by dst_stride |  | ||||||
|     stmia   r2, {r4-r5} |  | ||||||
|     ;stm        r2, {r4-r5} |  | ||||||
|     add     r2, r2, r3 |  | ||||||
|  |  | ||||||
|     bne     copy_mem8x8_fast_loop |  | ||||||
|  |  | ||||||
|     ;restore r4-r5 and return |  | ||||||
|     ldmia  sp!, {r4-r5} |  | ||||||
|     ;pop        {r4-r5} |  | ||||||
|     mov     pc, lr |  | ||||||
|  |  | ||||||
|     ENDP  ; |vp8_copy_mem8x8_v6| |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,70 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license and patent |  | ||||||
| ;  grant that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. All contributing project authors may be found in the AUTHORS |  | ||||||
| ;  file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_dc_only_idct_add_v6| |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY |  | ||||||
|  |  | ||||||
| ;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr, |  | ||||||
| ;                             int pred_stride, unsigned char *dst_ptr, |  | ||||||
| ;                             int dst_stride) |  | ||||||
| ; r0  input_dc |  | ||||||
| ; r1  pred_ptr |  | ||||||
| ; r2  pred_stride |  | ||||||
| ; r3  dst_ptr |  | ||||||
| ; sp  dst_stride |  | ||||||
| ; |  | ||||||
| ; Computes a1 = (input_dc + 4) >> 3, adds a1 to every byte of a 4x4 block |  | ||||||
| ; read from pred_ptr (rows pred_stride bytes apart), clamps each sum to |  | ||||||
| ; 0..255 and writes the block to dst_ptr (rows dst_stride bytes apart). |  | ||||||
| ; Each row is read and written as one 32-bit word. |  | ||||||
|  |  | ||||||
| |vp8_dc_only_idct_add_v6| PROC |  | ||||||
|     stmdb       sp!, {r4 - r7} |  | ||||||
|  |  | ||||||
|     add         r0, r0, #4                ; input_dc += 4 |  | ||||||
|     ldr         r12, c0x0000FFFF |  | ||||||
|     ldr         r4, [r1], r2 |  | ||||||
|     and         r0, r12, r0, asr #3       ; a1 = (input_dc + 4) >> 3, kept to the low 16 bits |  | ||||||
|     ldr         r6, [r1], r2 |  | ||||||
|     orr         r0, r0, r0, lsl #16       ; a1 | a1 |  | ||||||
|  |  | ||||||
|     ldr         r12, [sp, #16]            ; dst stride: first stack argument, above the 4 saved registers |  | ||||||
|  |  | ||||||
|     ; rows 0 and 1: add a1 to the even bytes (r5/r7) and odd bytes (r4/r6) |  | ||||||
|     ; of each predictor word, clamp every halfword to 8 bits, then |  | ||||||
|     ; re-interleave the even and odd results into one word per row |  | ||||||
|     uxtab16     r5, r0, r4                ; a1+2 | a1+0 |  | ||||||
|     uxtab16     r4, r0, r4, ror #8        ; a1+3 | a1+1 |  | ||||||
|     uxtab16     r7, r0, r6 |  | ||||||
|     uxtab16     r6, r0, r6, ror #8 |  | ||||||
|     usat16      r5, #8, r5 |  | ||||||
|     usat16      r4, #8, r4 |  | ||||||
|     usat16      r7, #8, r7 |  | ||||||
|     usat16      r6, #8, r6 |  | ||||||
|     orr         r5, r5, r4, lsl #8 |  | ||||||
|     orr         r7, r7, r6, lsl #8 |  | ||||||
|     ; loads of predictor rows 2 and 3 are interleaved with the stores of |  | ||||||
|     ; output rows 0 and 1 |  | ||||||
|     ldr         r4, [r1], r2 |  | ||||||
|     str         r5, [r3], r12 |  | ||||||
|     ldr         r6, [r1] |  | ||||||
|     str         r7, [r3], r12 |  | ||||||
|  |  | ||||||
|     ; rows 2 and 3: same sequence as for rows 0 and 1 |  | ||||||
|     uxtab16     r5, r0, r4 |  | ||||||
|     uxtab16     r4, r0, r4, ror #8 |  | ||||||
|     uxtab16     r7, r0, r6 |  | ||||||
|     uxtab16     r6, r0, r6, ror #8 |  | ||||||
|     usat16      r5, #8, r5 |  | ||||||
|     usat16      r4, #8, r4 |  | ||||||
|     usat16      r7, #8, r7 |  | ||||||
|     usat16      r6, #8, r6 |  | ||||||
|     orr         r5, r5, r4, lsl #8 |  | ||||||
|     orr         r7, r7, r6, lsl #8 |  | ||||||
|     str         r5, [r3], r12 |  | ||||||
|     str         r7, [r3] |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r7} |  | ||||||
|     bx          lr |  | ||||||
|  |  | ||||||
|     ENDP  ; |vp8_dc_only_idct_add_v6| |  | ||||||
|  |  | ||||||
| ; Constant Pool |  | ||||||
| ; mask used to keep only the low halfword of a1 |  | ||||||
| c0x0000FFFF DCD 0x0000FFFF |  | ||||||
|     END |  | ||||||
| @@ -1,190 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license and patent |  | ||||||
| ;  grant that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. All contributing project authors may be found in the AUTHORS |  | ||||||
| ;  file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|     EXPORT |vp8_dequant_idct_add_v6| |  | ||||||
|  |  | ||||||
|     AREA |.text|, CODE, READONLY |  | ||||||
| ;void vp8_dequant_idct_add_v6(short *input, short *dq, |  | ||||||
| ;                             unsigned char *dest, int stride) |  | ||||||
| ; r0 = input (q) |  | ||||||
| ; r1 = dq |  | ||||||
| ; r2 = dst |  | ||||||
| ; r3 = stride |  | ||||||
| ; |  | ||||||
| ; Works on one block of 16 coefficients in three stages: |  | ||||||
| ;   1. dequantize in place: input[i] = input[i] * dq[i] (low 16 bits kept) |  | ||||||
| ;   2. run the two-pass 4x4 inverse transform on the input buffer, add the |  | ||||||
| ;      rounded result ((x + 4) >> 3) to the 4x4 block of pixels at dst |  | ||||||
| ;      (rows stride bytes apart) and clamp each pixel to 0..255 |  | ||||||
| ;   3. clear the 32-byte input buffer to zero |  | ||||||
|  |  | ||||||
| |vp8_dequant_idct_add_v6| PROC |  | ||||||
|     stmdb   sp!, {r4-r11, lr} |  | ||||||
|  |  | ||||||
|     ldr     r4, [r0]                ;input |  | ||||||
|     ldr     r5, [r1], #4            ;dq |  | ||||||
|  |  | ||||||
|     ; keep stride on the stack: r3 is reused for a transform constant below |  | ||||||
|     sub     sp, sp, #4 |  | ||||||
|     str     r3, [sp] |  | ||||||
|  |  | ||||||
|     ; 4 iterations, each dequantizing two words (four coefficients) |  | ||||||
|     mov     r12, #4 |  | ||||||
|  |  | ||||||
| vp8_dequant_add_loop |  | ||||||
|     ; multiply the bottom halfwords and the top halfwords separately |  | ||||||
|     smulbb  r6, r4, r5 |  | ||||||
|     smultt  r7, r4, r5 |  | ||||||
|  |  | ||||||
|     ldr     r4, [r0, #4]            ;input |  | ||||||
|     ldr     r5, [r1], #4            ;dq |  | ||||||
|  |  | ||||||
|     ; write the two products back over the coefficients they came from |  | ||||||
|     strh    r6, [r0], #2 |  | ||||||
|     strh    r7, [r0], #2 |  | ||||||
|  |  | ||||||
|     smulbb  r6, r4, r5 |  | ||||||
|     smultt  r7, r4, r5 |  | ||||||
|  |  | ||||||
|     ; flags set here are consumed by the ldrne/bne below |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     ; load the next input/dq words only if another iteration follows |  | ||||||
|     ldrne   r4, [r0, #4] |  | ||||||
|     ldrne   r5, [r1], #4 |  | ||||||
|  |  | ||||||
|     strh    r6, [r0], #2 |  | ||||||
|     strh    r7, [r0], #2 |  | ||||||
|  |  | ||||||
|     bne     vp8_dequant_add_loop |  | ||||||
|  |  | ||||||
|     ; rewind r0 to the start of the buffer; r1 becomes the first-pass |  | ||||||
|     ; write pointer into the same buffer |  | ||||||
|     sub     r0, r0, #32 |  | ||||||
|     mov     r1, r0 |  | ||||||
|  |  | ||||||
| ; short_idct4x4llm_v6_dual |  | ||||||
|     ldr     r3, cospi8sqrt2minus1 |  | ||||||
|     ldr     r4, sinpi8sqrt2 |  | ||||||
|     ldr     r6, [r0, #8] |  | ||||||
|     ; first pass: 2 iterations, each handling one word (two packed |  | ||||||
|     ; coefficients) from each of the four rows at byte offsets 0, 8, 16, 24 |  | ||||||
|     mov     r5, #2 |  | ||||||
| vp8_dequant_idct_loop1_v6 |  | ||||||
|     ldr     r12, [r0, #24] |  | ||||||
|     ldr     r14, [r0, #16] |  | ||||||
|     smulwt  r9, r3, r6 |  | ||||||
|     smulwb  r7, r3, r6 |  | ||||||
|     smulwt  r10, r4, r6 |  | ||||||
|     smulwb  r8, r4, r6 |  | ||||||
|     pkhbt   r7, r7, r9, lsl #16 |  | ||||||
|     smulwt  r11, r3, r12 |  | ||||||
|     pkhbt   r8, r8, r10, lsl #16 |  | ||||||
|     uadd16  r6, r6, r7 |  | ||||||
|     smulwt  r7, r4, r12 |  | ||||||
|     smulwb  r9, r3, r12 |  | ||||||
|     smulwb  r10, r4, r12 |  | ||||||
|     ; flags set here are consumed by the ldrne/bne at the end of the loop |  | ||||||
|     subs    r5, r5, #1 |  | ||||||
|     pkhbt   r9, r9, r11, lsl #16 |  | ||||||
|     ldr     r11, [r0], #4 |  | ||||||
|     pkhbt   r10, r10, r7, lsl #16 |  | ||||||
|     uadd16  r7, r12, r9 |  | ||||||
|     usub16  r7, r8, r7 |  | ||||||
|     uadd16  r6, r6, r10 |  | ||||||
|     uadd16  r10, r11, r14 |  | ||||||
|     usub16  r8, r11, r14 |  | ||||||
|     uadd16  r9, r10, r6 |  | ||||||
|     usub16  r10, r10, r6 |  | ||||||
|     uadd16  r6, r8, r7 |  | ||||||
|     usub16  r7, r8, r7 |  | ||||||
|     ; results go back into the same buffer through r1 |  | ||||||
|     str     r6, [r1, #8] |  | ||||||
|     ldrne   r6, [r0, #8] |  | ||||||
|     str     r7, [r1, #16] |  | ||||||
|     str     r10, [r1, #24] |  | ||||||
|     str     r9, [r1], #4 |  | ||||||
|     bne     vp8_dequant_idct_loop1_v6 |  | ||||||
|  |  | ||||||
|     ; second pass: 2 iterations, each reading four consecutive words |  | ||||||
|     ; (16 bytes) of first-pass output and producing two rows of dst; |  | ||||||
|     ; r1 advanced by 8 in the first pass, so r1 - 8 is the buffer start |  | ||||||
|     mov     r5, #2 |  | ||||||
|     sub     r0, r1, #8 |  | ||||||
| vp8_dequant_idct_loop2_v6 |  | ||||||
|     ldr     r6, [r0], #4 |  | ||||||
|     ldr     r7, [r0], #4 |  | ||||||
|     ldr     r8, [r0], #4 |  | ||||||
|     ldr     r9, [r0], #4 |  | ||||||
|     smulwt  r1, r3, r6 |  | ||||||
|     smulwt  r12, r4, r6 |  | ||||||
|     smulwt  lr, r3, r8 |  | ||||||
|     smulwt  r10, r4, r8 |  | ||||||
|     pkhbt   r11, r8, r6, lsl #16 |  | ||||||
|     pkhbt   r1, lr, r1, lsl #16 |  | ||||||
|     pkhbt   r12, r10, r12, lsl #16 |  | ||||||
|     pkhtb   r6, r6, r8, asr #16 |  | ||||||
|     uadd16  r6, r1, r6 |  | ||||||
|     pkhbt   lr, r9, r7, lsl #16 |  | ||||||
|     uadd16  r10, r11, lr |  | ||||||
|     usub16  lr, r11, lr |  | ||||||
|     pkhtb   r8, r7, r9, asr #16 |  | ||||||
|     ; flags set here are consumed by the bne at the end of the loop |  | ||||||
|     subs    r5, r5, #1 |  | ||||||
|     smulwt  r1, r3, r8 |  | ||||||
|     smulwb  r7, r3, r8 |  | ||||||
|     smulwt  r11, r4, r8 |  | ||||||
|     smulwb  r9, r4, r8 |  | ||||||
|     pkhbt   r1, r7, r1, lsl #16 |  | ||||||
|     uadd16  r8, r1, r8 |  | ||||||
|     pkhbt   r11, r9, r11, lsl #16 |  | ||||||
|     usub16  r1, r12, r8 |  | ||||||
|     uadd16  r8, r11, r6 |  | ||||||
|     ldr     r9, c0x00040004 |  | ||||||
|     ldr     r12, [sp]               ; get stride from stack |  | ||||||
|     uadd16  r6, r10, r8 |  | ||||||
|     usub16  r7, r10, r8 |  | ||||||
|     ; add the rounding constant 4 to every halfword ahead of the >> 3 |  | ||||||
|     uadd16  r7, r7, r9 |  | ||||||
|     uadd16  r6, r6, r9 |  | ||||||
|     uadd16  r10, r14, r1 |  | ||||||
|     usub16  r1, r14, r1 |  | ||||||
|     uadd16  r10, r10, r9 |  | ||||||
|     uadd16  r1, r1, r9 |  | ||||||
|     ldr     r11, [r2]               ; load input from dst |  | ||||||
|     ; first output row: built from the top halfwords, each shifted right by 3 |  | ||||||
|     mov     r8, r7, asr #3 |  | ||||||
|     pkhtb   r9, r8, r10, asr #19 |  | ||||||
|     mov     r8, r1, asr #3 |  | ||||||
|     pkhtb   r8, r8, r6, asr #19 |  | ||||||
|     ; add the odd and even dst bytes with saturation, clamp to 8 bits, |  | ||||||
|     ; then re-interleave into one word of four pixels |  | ||||||
|     uxtb16  lr, r11, ror #8 |  | ||||||
|     qadd16  r9, r9, lr |  | ||||||
|     uxtb16  lr, r11 |  | ||||||
|     qadd16  r8, r8, lr |  | ||||||
|     usat16  r9, #8, r9 |  | ||||||
|     usat16  r8, #8, r8 |  | ||||||
|     orr     r9, r8, r9, lsl #8 |  | ||||||
|     ldr     r11, [r2, r12]          ; load input from dst |  | ||||||
|     ; second output row: move the bottom halfwords to the top, then |  | ||||||
|     ; repeat the same shift/pack/add/clamp sequence |  | ||||||
|     mov     r7, r7, lsl #16 |  | ||||||
|     mov     r1, r1, lsl #16 |  | ||||||
|     mov     r10, r10, lsl #16 |  | ||||||
|     mov     r6, r6, lsl #16 |  | ||||||
|     mov     r7, r7, asr #3 |  | ||||||
|     pkhtb   r7, r7, r10, asr #19 |  | ||||||
|     mov     r1, r1, asr #3 |  | ||||||
|     pkhtb   r1, r1, r6, asr #19 |  | ||||||
|     uxtb16  r8, r11, ror #8 |  | ||||||
|     qadd16  r7, r7, r8 |  | ||||||
|     uxtb16  r8, r11 |  | ||||||
|     qadd16  r1, r1, r8 |  | ||||||
|     usat16  r7, #8, r7 |  | ||||||
|     usat16  r1, #8, r1 |  | ||||||
|     orr     r1, r1, r7, lsl #8 |  | ||||||
|     str     r9, [r2], r12           ; store output to dst |  | ||||||
|     str     r1, [r2], r12           ; store output to dst |  | ||||||
|     bne     vp8_dequant_idct_loop2_v6 |  | ||||||
|  |  | ||||||
| ; memset |  | ||||||
|     ; r0 advanced by 32 in the second pass; rewind it to the buffer start |  | ||||||
|     ; and release the stack slot that held stride |  | ||||||
|     sub     r0, r0, #32 |  | ||||||
|     add     sp, sp, #4 |  | ||||||
|  |  | ||||||
|     ; zero all eight words (32 bytes) of the input buffer |  | ||||||
|     mov     r12, #0 |  | ||||||
|     str     r12, [r0] |  | ||||||
|     str     r12, [r0, #4] |  | ||||||
|     str     r12, [r0, #8] |  | ||||||
|     str     r12, [r0, #12] |  | ||||||
|     str     r12, [r0, #16] |  | ||||||
|     str     r12, [r0, #20] |  | ||||||
|     str     r12, [r0, #24] |  | ||||||
|     str     r12, [r0, #28] |  | ||||||
|  |  | ||||||
|     ; restore callee-saved registers and return by loading pc |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|     ENDP    ; |vp8_dequant_idct_add_v6| |  | ||||||
|  |  | ||||||
| ; Constant Pool |  | ||||||
| ; 0x4E7B = 20091 and 0x8A8C = 35468: multipliers used with smulwb/smulwt, |  | ||||||
| ; which keep the top 32 bits of the 48-bit product |  | ||||||
| cospi8sqrt2minus1 DCD 0x00004E7B |  | ||||||
| sinpi8sqrt2       DCD 0x00008A8C |  | ||||||
| ; rounding value 4 in both halfwords |  | ||||||
| c0x00040004       DCD 0x00040004 |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,69 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_dequantize_b_loop_v6| |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY  ; name this block of code |  | ||||||
| ;------------------------------- |  | ||||||
| ;void   vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); |  | ||||||
| ; r0    short *Q, |  | ||||||
| ; r1    short *DQC |  | ||||||
| ; r2    short *DQ |  | ||||||
| ; |  | ||||||
| ; Writes DQ[i] = Q[i] * DQC[i] (low 16 bits of each product) for 16 |  | ||||||
| ; consecutive 16-bit values. Q and DQC are read as 32-bit words holding two |  | ||||||
| ; values each; the loop runs twice and produces eight results per pass. |  | ||||||
| |vp8_dequantize_b_loop_v6| PROC |  | ||||||
|     stmdb   sp!, {r4-r9, lr} |  | ||||||
|  |  | ||||||
|     ldr     r3, [r0]                ;load Q |  | ||||||
|     ldr     r4, [r1]                ;load DQC |  | ||||||
|     ldr     r5, [r0, #4] |  | ||||||
|     ldr     r6, [r1, #4] |  | ||||||
|  |  | ||||||
|     mov     r12, #2                 ;loop counter |  | ||||||
|  |  | ||||||
| dequant_loop |  | ||||||
|     ; bottom*bottom and top*top halfword products of the two loaded words |  | ||||||
|     smulbb  r7, r3, r4              ;multiply |  | ||||||
|     smultt  r8, r3, r4 |  | ||||||
|     smulbb  r9, r5, r6 |  | ||||||
|     smultt  lr, r5, r6 |  | ||||||
|  |  | ||||||
|     ; load the next two words of Q and DQC |  | ||||||
|     ldr     r3, [r0, #8] |  | ||||||
|     ldr     r4, [r1, #8] |  | ||||||
|     ldr     r5, [r0, #12] |  | ||||||
|     ldr     r6, [r1, #12] |  | ||||||
|  |  | ||||||
|     ; each store of a finished product is followed by the multiply that |  | ||||||
|     ; reuses its register for the next result |  | ||||||
|     strh    r7, [r2], #2            ;store result |  | ||||||
|     smulbb  r7, r3, r4              ;multiply |  | ||||||
|     strh    r8, [r2], #2 |  | ||||||
|     smultt  r8, r3, r4 |  | ||||||
|     strh    r9, [r2], #2 |  | ||||||
|     smulbb  r9, r5, r6 |  | ||||||
|     strh    lr, [r2], #2 |  | ||||||
|     smultt  lr, r5, r6 |  | ||||||
|  |  | ||||||
|     ; flags set here are consumed by the ldrne/bne below |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     ; step both input pointers past the eight values just consumed |  | ||||||
|     add     r0, r0, #16 |  | ||||||
|     add     r1, r1, #16 |  | ||||||
|  |  | ||||||
|     ; loads for the next pass happen only if another pass follows |  | ||||||
|     ldrne       r3, [r0] |  | ||||||
|     strh    r7, [r2], #2            ;store result |  | ||||||
|     ldrne       r4, [r1] |  | ||||||
|     strh    r8, [r2], #2 |  | ||||||
|     ldrne       r5, [r0, #4] |  | ||||||
|     strh    r9, [r2], #2 |  | ||||||
|     ldrne       r6, [r1, #4] |  | ||||||
|     strh    lr, [r2], #2 |  | ||||||
|  |  | ||||||
|     bne     dequant_loop |  | ||||||
|  |  | ||||||
|     ; restore callee-saved registers and return by loading pc |  | ||||||
|     ldmia   sp!, {r4-r9, pc} |  | ||||||
|     ENDP    ;|vp8_dequantize_b_loop_v6| |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,624 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_filter_block2d_first_pass_armv6| |  | ||||||
|     EXPORT  |vp8_filter_block2d_first_pass_16x16_armv6| |  | ||||||
|     EXPORT  |vp8_filter_block2d_first_pass_8x8_armv6| |  | ||||||
|     EXPORT  |vp8_filter_block2d_second_pass_armv6| |  | ||||||
|     EXPORT  |vp8_filter4_block2d_second_pass_armv6| |  | ||||||
|     EXPORT  |vp8_filter_block2d_first_pass_only_armv6| |  | ||||||
|     EXPORT  |vp8_filter_block2d_second_pass_only_armv6| |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY  ; name this block of code |  | ||||||
| ;------------------------------------- |  | ||||||
| ; r0    unsigned char *src_ptr |  | ||||||
| ; r1    short         *output_ptr |  | ||||||
| ; r2    unsigned int src_pixels_per_line |  | ||||||
| ; r3    unsigned int output_width |  | ||||||
| ; stack unsigned int output_height |  | ||||||
| ; stack const short *vp8_filter |  | ||||||
| ;------------------------------------- |  | ||||||
| ; Filter the input horizontally and put the result in the output array.  Applies the |  | ||||||
| ; 6 tap FIR filter to source bytes [-2 .. +3] around each pixel; each input is a |  | ||||||
| ; 1 byte value and each output is a 2 byte value.  Every sum is rounded (+0x40), |  | ||||||
| ; shifted right by 7 and clamped to 0..255.  Two outputs are produced per inner |  | ||||||
| ; loop iteration and the output is stored transposed. |  | ||||||
| |vp8_filter_block2d_first_pass_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ; 9 registers (36 bytes) were pushed, so the stack arguments start at sp + 36 |  | ||||||
|     ldr     r11, [sp, #40]                  ; vp8_filter address |  | ||||||
|     ldr     r7, [sp, #36]                   ; output height |  | ||||||
|  |  | ||||||
|     sub     r2, r2, r3                      ; inside loop increments input array, |  | ||||||
|                                             ; so the height loop only needs to add |  | ||||||
|                                             ; r2 - width to the input pointer |  | ||||||
|  |  | ||||||
|     mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts |  | ||||||
|     add     r12, r3, #16                    ; square off the output: r12 is the byte step |  | ||||||
|                                             ; applied to r1 after every store |  | ||||||
|     sub     sp, sp, #4                      ; one stack word to hold the destination pointer |  | ||||||
|  |  | ||||||
|     ldr     r4, [r11]                       ; load up packed filter coefficients |  | ||||||
|     ldr     r5, [r11, #4] |  | ||||||
|     ldr     r6, [r11, #8] |  | ||||||
|  |  | ||||||
|     str     r1, [sp]                        ; push destination to stack |  | ||||||
|     mov     r7, r7, lsl #16                 ; height is top part of counter |  | ||||||
|  |  | ||||||
| ; six tap filter |  | ||||||
| |height_loop_1st_6| |  | ||||||
|     ldrb    r8, [r0, #-2]                   ; load source data |  | ||||||
|     ldrb    r9, [r0, #-1] |  | ||||||
|     ldrb    r10, [r0], #2 |  | ||||||
|     orr     r7, r7, r3, lsr #2              ; construct loop counter: low part is width / 2 |  | ||||||
|  |  | ||||||
| |width_loop_1st_6| |  | ||||||
|     ldrb    r11, [r0, #-1] |  | ||||||
|  |  | ||||||
|     ; lr accumulates the first output of the pair, r8 (finally r11) the second |  | ||||||
|     pkhbt   lr, r8, r9, lsl #16             ; r9 | r8 |  | ||||||
|     pkhbt   r8, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|  |  | ||||||
|     ldrb    r9, [r0] |  | ||||||
|  |  | ||||||
|     smuad   lr, lr, r4                      ; apply the filter |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|     smuad   r8, r8, r4 |  | ||||||
|     pkhbt   r11, r11, r9, lsl #16           ; r9 | r11 |  | ||||||
|  |  | ||||||
|     smlad   lr, r10, r5, lr |  | ||||||
|     ldrb    r10, [r0, #1] |  | ||||||
|     smlad   r8, r11, r5, r8 |  | ||||||
|     ldrb    r11, [r0, #2] |  | ||||||
|  |  | ||||||
|     sub     r7, r7, #1                      ; one pair of outputs done |  | ||||||
|  |  | ||||||
|     pkhbt   r9, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|  |  | ||||||
|     smlad   lr, r9, r6, lr |  | ||||||
|     smlad   r11, r10, r6, r8 |  | ||||||
|  |  | ||||||
|     ands    r10, r7, #0xff                  ; test loop counter; flags are used by the |  | ||||||
|                                             ; ldrneb/bne instructions below |  | ||||||
|  |  | ||||||
|     add     lr, lr, #0x40                   ; round_shift_and_clamp |  | ||||||
|     ldrneb  r8, [r0, #-2]                   ; load data for next loop |  | ||||||
|     usat    lr, #8, lr, asr #7 |  | ||||||
|     add     r11, r11, #0x40 |  | ||||||
|     ldrneb  r9, [r0, #-1] |  | ||||||
|     usat    r11, #8, r11, asr #7 |  | ||||||
|  |  | ||||||
|     strh    lr, [r1], r12                   ; result is transposed and stored, which |  | ||||||
|                                             ; will make second pass filtering easier. |  | ||||||
|     ldrneb  r10, [r0], #2 |  | ||||||
|     strh    r11, [r1], r12 |  | ||||||
|  |  | ||||||
|     bne     width_loop_1st_6 |  | ||||||
|  |  | ||||||
|     ldr     r1, [sp]                        ; load and update dst address |  | ||||||
|     subs    r7, r7, #0x10000                ; one line done; flags are used by the bne below |  | ||||||
|     add     r0, r0, r2                      ; move to next input line |  | ||||||
|  |  | ||||||
|     add     r1, r1, #2                      ; move over to next column |  | ||||||
|     str     r1, [sp] |  | ||||||
|  |  | ||||||
|     bne     height_loop_1st_6 |  | ||||||
|  |  | ||||||
|     add     sp, sp, #4                      ; release the destination pointer slot |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ; -------------------------- |  | ||||||
| ; 16x16 version |  | ||||||
| ; Same register/stack arguments and filtering as |  | ||||||
| ; vp8_filter_block2d_first_pass_armv6, with source preloads (pld) added |  | ||||||
| ; using offsets that match a block width of 16. |  | ||||||
| ; r0    unsigned char *src_ptr |  | ||||||
| ; r1    short         *output_ptr |  | ||||||
| ; r2    unsigned int src_pixels_per_line |  | ||||||
| ; r3    unsigned int output_width |  | ||||||
| ; stack unsigned int output_height |  | ||||||
| ; stack const short *vp8_filter |  | ||||||
| ; ----------------------------- |  | ||||||
| |vp8_filter_block2d_first_pass_16x16_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ; 9 registers (36 bytes) were pushed, so the stack arguments start at sp + 36 |  | ||||||
|     ldr     r11, [sp, #40]                  ; vp8_filter address |  | ||||||
|     ldr     r7, [sp, #36]                   ; output height |  | ||||||
|  |  | ||||||
|     add     r4, r2, #18                     ; preload next row: src + stride + 18 |  | ||||||
|     pld     [r0, r4] |  | ||||||
|  |  | ||||||
|     sub     r2, r2, r3                      ; inside loop increments input array, |  | ||||||
|                                             ; so the height loop only needs to add |  | ||||||
|                                             ; r2 - width to the input pointer |  | ||||||
|  |  | ||||||
|     mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts |  | ||||||
|     add     r12, r3, #16                    ; square off the output: r12 is the byte step |  | ||||||
|                                             ; applied to r1 after every store |  | ||||||
|     sub     sp, sp, #4                      ; one stack word to hold the destination pointer |  | ||||||
|  |  | ||||||
|     ldr     r4, [r11]                       ; load up packed filter coefficients |  | ||||||
|     ldr     r5, [r11, #4] |  | ||||||
|     ldr     r6, [r11, #8] |  | ||||||
|  |  | ||||||
|     str     r1, [sp]                        ; push destination to stack |  | ||||||
|     mov     r7, r7, lsl #16                 ; height is top part of counter |  | ||||||
|  |  | ||||||
| ; six tap filter |  | ||||||
| |height_loop_1st_16_6| |  | ||||||
|     ldrb    r8, [r0, #-2]                   ; load source data |  | ||||||
|     ldrb    r9, [r0, #-1] |  | ||||||
|     ldrb    r10, [r0], #2 |  | ||||||
|     orr     r7, r7, r3, lsr #2              ; construct loop counter: low part is width / 2 |  | ||||||
|  |  | ||||||
| |width_loop_1st_16_6| |  | ||||||
|     ldrb    r11, [r0, #-1] |  | ||||||
|  |  | ||||||
|     ; lr accumulates the first output of the pair, r8 (finally r11) the second |  | ||||||
|     pkhbt   lr, r8, r9, lsl #16             ; r9 | r8 |  | ||||||
|     pkhbt   r8, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|  |  | ||||||
|     ldrb    r9, [r0] |  | ||||||
|  |  | ||||||
|     smuad   lr, lr, r4                      ; apply the filter |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|     smuad   r8, r8, r4 |  | ||||||
|     pkhbt   r11, r11, r9, lsl #16           ; r9 | r11 |  | ||||||
|  |  | ||||||
|     smlad   lr, r10, r5, lr |  | ||||||
|     ldrb    r10, [r0, #1] |  | ||||||
|     smlad   r8, r11, r5, r8 |  | ||||||
|     ldrb    r11, [r0, #2] |  | ||||||
|  |  | ||||||
|     sub     r7, r7, #1                      ; one pair of outputs done |  | ||||||
|  |  | ||||||
|     pkhbt   r9, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|  |  | ||||||
|     smlad   lr, r9, r6, lr |  | ||||||
|     smlad   r11, r10, r6, r8 |  | ||||||
|  |  | ||||||
|     ands    r10, r7, #0xff                  ; test loop counter; flags are used by the |  | ||||||
|                                             ; ldrneb/bne instructions below |  | ||||||
|  |  | ||||||
|     add     lr, lr, #0x40                   ; round_shift_and_clamp |  | ||||||
|     ldrneb  r8, [r0, #-2]                   ; load data for next loop |  | ||||||
|     usat    lr, #8, lr, asr #7 |  | ||||||
|     add     r11, r11, #0x40 |  | ||||||
|     ldrneb  r9, [r0, #-1] |  | ||||||
|     usat    r11, #8, r11, asr #7 |  | ||||||
|  |  | ||||||
|     strh    lr, [r1], r12                   ; result is transposed and stored, which |  | ||||||
|                                             ; will make second pass filtering easier. |  | ||||||
|     ldrneb  r10, [r0], #2 |  | ||||||
|     strh    r11, [r1], r12 |  | ||||||
|  |  | ||||||
|     bne     width_loop_1st_16_6 |  | ||||||
|  |  | ||||||
|     ldr     r1, [sp]                        ; load and update dst address |  | ||||||
|     subs    r7, r7, #0x10000                ; one line done; flags are used by the bne below |  | ||||||
|     add     r0, r0, r2                      ; move to next input line |  | ||||||
|  |  | ||||||
|     add     r11, r2, #34                    ; adding back block width(=16) |  | ||||||
|     pld     [r0, r11]                       ; preload next row |  | ||||||
|  |  | ||||||
|     add     r1, r1, #2                      ; move over to next column |  | ||||||
|     str     r1, [sp] |  | ||||||
|  |  | ||||||
|     bne     height_loop_1st_16_6 |  | ||||||
|  |  | ||||||
|     add     sp, sp, #4                      ; release the destination pointer slot |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ; -------------------------- |  | ||||||
| ; 8x8 version |  | ||||||
| ; Same register/stack arguments and filtering as |  | ||||||
| ; vp8_filter_block2d_first_pass_armv6, with source preloads (pld) added |  | ||||||
| ; using offsets that match a block width of 8. |  | ||||||
| ; r0    unsigned char *src_ptr |  | ||||||
| ; r1    short         *output_ptr |  | ||||||
| ; r2    unsigned int src_pixels_per_line |  | ||||||
| ; r3    unsigned int output_width |  | ||||||
| ; stack unsigned int output_height |  | ||||||
| ; stack const short *vp8_filter |  | ||||||
| ; ----------------------------- |  | ||||||
| |vp8_filter_block2d_first_pass_8x8_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ; 9 registers (36 bytes) were pushed, so the stack arguments start at sp + 36 |  | ||||||
|     ldr     r11, [sp, #40]                  ; vp8_filter address |  | ||||||
|     ldr     r7, [sp, #36]                   ; output height |  | ||||||
|  |  | ||||||
|     add     r4, r2, #10                     ; preload next row: src + stride + 10 |  | ||||||
|     pld     [r0, r4] |  | ||||||
|  |  | ||||||
|     sub     r2, r2, r3                      ; inside loop increments input array, |  | ||||||
|                                             ; so the height loop only needs to add |  | ||||||
|                                             ; r2 - width to the input pointer |  | ||||||
|  |  | ||||||
|     mov     r3, r3, lsl #1                  ; multiply width by 2 because using shorts |  | ||||||
|     add     r12, r3, #16                    ; square off the output: r12 is the byte step |  | ||||||
|                                             ; applied to r1 after every store |  | ||||||
|     sub     sp, sp, #4                      ; one stack word to hold the destination pointer |  | ||||||
|  |  | ||||||
|     ldr     r4, [r11]                       ; load up packed filter coefficients |  | ||||||
|     ldr     r5, [r11, #4] |  | ||||||
|     ldr     r6, [r11, #8] |  | ||||||
|  |  | ||||||
|     str     r1, [sp]                        ; push destination to stack |  | ||||||
|     mov     r7, r7, lsl #16                 ; height is top part of counter |  | ||||||
|  |  | ||||||
| ; six tap filter |  | ||||||
| |height_loop_1st_8_6| |  | ||||||
|     ldrb    r8, [r0, #-2]                   ; load source data |  | ||||||
|     ldrb    r9, [r0, #-1] |  | ||||||
|     ldrb    r10, [r0], #2 |  | ||||||
|     orr     r7, r7, r3, lsr #2              ; construct loop counter: low part is width / 2 |  | ||||||
|  |  | ||||||
| |width_loop_1st_8_6| |  | ||||||
|     ldrb    r11, [r0, #-1] |  | ||||||
|  |  | ||||||
|     ; lr accumulates the first output of the pair, r8 (finally r11) the second |  | ||||||
|     pkhbt   lr, r8, r9, lsl #16             ; r9 | r8 |  | ||||||
|     pkhbt   r8, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|  |  | ||||||
|     ldrb    r9, [r0] |  | ||||||
|  |  | ||||||
|     smuad   lr, lr, r4                      ; apply the filter |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|     smuad   r8, r8, r4 |  | ||||||
|     pkhbt   r11, r11, r9, lsl #16           ; r9 | r11 |  | ||||||
|  |  | ||||||
|     smlad   lr, r10, r5, lr |  | ||||||
|     ldrb    r10, [r0, #1] |  | ||||||
|     smlad   r8, r11, r5, r8 |  | ||||||
|     ldrb    r11, [r0, #2] |  | ||||||
|  |  | ||||||
|     sub     r7, r7, #1                      ; one pair of outputs done |  | ||||||
|  |  | ||||||
|     pkhbt   r9, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|  |  | ||||||
|     smlad   lr, r9, r6, lr |  | ||||||
|     smlad   r11, r10, r6, r8 |  | ||||||
|  |  | ||||||
|     ands    r10, r7, #0xff                  ; test loop counter; flags are used by the |  | ||||||
|                                             ; ldrneb/bne instructions below |  | ||||||
|  |  | ||||||
|     add     lr, lr, #0x40                   ; round_shift_and_clamp |  | ||||||
|     ldrneb  r8, [r0, #-2]                   ; load data for next loop |  | ||||||
|     usat    lr, #8, lr, asr #7 |  | ||||||
|     add     r11, r11, #0x40 |  | ||||||
|     ldrneb  r9, [r0, #-1] |  | ||||||
|     usat    r11, #8, r11, asr #7 |  | ||||||
|  |  | ||||||
|     strh    lr, [r1], r12                   ; result is transposed and stored, which |  | ||||||
|                                             ; will make second pass filtering easier. |  | ||||||
|     ldrneb  r10, [r0], #2 |  | ||||||
|     strh    r11, [r1], r12 |  | ||||||
|  |  | ||||||
|     bne     width_loop_1st_8_6 |  | ||||||
|  |  | ||||||
|     ldr     r1, [sp]                        ; load and update dst address |  | ||||||
|     subs    r7, r7, #0x10000                ; one line done; flags are used by the bne below |  | ||||||
|     add     r0, r0, r2                      ; move to next input line |  | ||||||
|  |  | ||||||
|     add     r11, r2, #18                    ; adding back block width(=8) |  | ||||||
|     pld     [r0, r11]                       ; preload next row |  | ||||||
|  |  | ||||||
|     add     r1, r1, #2                      ; move over to next column |  | ||||||
|     str     r1, [sp] |  | ||||||
|  |  | ||||||
|     bne     height_loop_1st_8_6 |  | ||||||
|  |  | ||||||
|     add     sp, sp, #4                      ; release the destination pointer slot |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ;--------------------------------- |  | ||||||
  ; vp8_filter_block2d_second_pass_armv6
  ;
  ; Second pass of the two-pass six-tap filter. Reads 16-bit samples from
  ; src_ptr, multiplies them by the six 16-bit coefficients at vp8_filter,
  ; adds 0x40, shifts right by 7 and saturates each sum to 8 bits.
  ; Two results are produced per inner iteration and stored output_pitch
  ; bytes apart; every outer iteration restarts one byte further into the
  ; destination, so the output is written transposed relative to the input.
  ; The outer loop runs cnt times, the inner loop cnt/2 times.
  ;
| ; r0    short         *src_ptr, |  | ||||||
| ; r1    unsigned char *output_ptr, |  | ||||||
| ; r2    unsigned int output_pitch, |  | ||||||
| ; r3    unsigned int cnt, |  | ||||||
| ; stack const short *vp8_filter |  | ||||||
| ;--------------------------------- |  | ||||||
| |vp8_filter_block2d_second_pass_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
      ; nine registers (36 bytes) were pushed, so [sp, #36] is the first
      ; stack argument
|     ldr     r11, [sp, #36]                  ; vp8_filter address |  | ||||||
|     sub     sp, sp, #4 |  | ||||||
|     mov     r7, r3, lsl #16                 ; height is top part of counter |  | ||||||
|     str     r1, [sp]                        ; push destination to stack |  | ||||||
|  |  | ||||||
|     ldr     r4, [r11]                       ; load up packed filter coefficients |  | ||||||
|     ldr     r5, [r11, #4] |  | ||||||
|     ldr     r6, [r11, #8] |  | ||||||
|  |  | ||||||
      ; r12 = top half of r4 : bottom half of r5
      ; r11 = top half of r5 : bottom half of r6
      ; (coefficient pairs shifted by one tap, used for the second output)
|     pkhbt   r12, r5, r4                     ; pack the filter differently |  | ||||||
|     pkhbt   r11, r6, r5 |  | ||||||
|  |  | ||||||
|     sub     r0, r0, #4                      ; offset input buffer (back by two 16-bit samples) |  | ||||||
|  |  | ||||||
| |height_loop_2nd| |  | ||||||
|     ldr     r8, [r0]                        ; load the data |  | ||||||
|     ldr     r9, [r0, #4] |  | ||||||
|     orr     r7, r7, r3, lsr #1              ; inner loop counter (cnt/2) in the low byte of r7 |  | ||||||
|  |  | ||||||
| |width_loop_2nd| |  | ||||||
      ; lr accumulates the first output, r8/r10 the second one
|     smuad   lr, r4, r8                      ; apply filter |  | ||||||
|     sub     r7, r7, #1 |  | ||||||
|     smulbt  r8, r4, r8 |  | ||||||
|  |  | ||||||
|     ldr     r10, [r0, #8] |  | ||||||
|  |  | ||||||
|     smlad   lr, r5, r9, lr |  | ||||||
|     smladx  r8, r12, r9, r8 |  | ||||||
|  |  | ||||||
|     ldrh    r9, [r0, #12] |  | ||||||
|  |  | ||||||
|     smlad   lr, r6, r10, lr |  | ||||||
|     smladx  r8, r11, r10, r8 |  | ||||||
|  |  | ||||||
|     add     r0, r0, #4 |  | ||||||
|     smlatb  r10, r6, r9, r8 |  | ||||||
|  |  | ||||||
|     add     lr, lr, #0x40                   ; round_shift_and_clamp |  | ||||||
      ; test the inner counter; the Z flag set here is consumed by the
      ; ldrne/bne instructions below (nothing in between updates it)
|     ands    r8, r7, #0xff |  | ||||||
|     usat    lr, #8, lr, asr #7 |  | ||||||
|     add     r10, r10, #0x40 |  | ||||||
|     strb    lr, [r1], r2                    ; the result is transposed back and stored |  | ||||||
|     usat    r10, #8, r10, asr #7 |  | ||||||
|  |  | ||||||
|     ldrne   r8, [r0]                        ; load data for next loop |  | ||||||
|     ldrne   r9, [r0, #4] |  | ||||||
|     strb    r10, [r1], r2 |  | ||||||
|  |  | ||||||
|     bne     width_loop_2nd |  | ||||||
|  |  | ||||||
|     ldr     r1, [sp]                        ; update dst for next loop |  | ||||||
      ; decrement the outer counter kept in the top half of r7
|     subs    r7, r7, #0x10000 |  | ||||||
|     add     r0, r0, #16                     ; update src for next loop |  | ||||||
|     add     r1, r1, #1 |  | ||||||
|     str     r1, [sp] |  | ||||||
|  |  | ||||||
|     bne     height_loop_2nd |  | ||||||
|  |  | ||||||
|     add     sp, sp, #4 |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ;--------------------------------- |  | ||||||
  ; vp8_filter4_block2d_second_pass_armv6
  ;
  ; Four-tap variant of the second pass. Of the three coefficient words at
  ; vp8_filter only four halfwords are used: the top half of the first word,
  ; both halves of the second and the bottom half of the third. r4 is then
  ; reused for the rounding constant and r5/r6 as accumulators.
  ; Each inner iteration produces two results (rounded with 0x40, shifted
  ; right by 7, saturated to 8 bits) and stores them output_pitch bytes
  ; apart; each outer iteration moves the destination on by one byte.
  ; The outer loop runs cnt times, the inner loop cnt/2 times.
  ;
| ; r0    short         *src_ptr, |  | ||||||
| ; r1    unsigned char *output_ptr, |  | ||||||
| ; r2    unsigned int output_pitch, |  | ||||||
| ; r3    unsigned int cnt, |  | ||||||
| ; stack const short *vp8_filter |  | ||||||
| ;--------------------------------- |  | ||||||
| |vp8_filter4_block2d_second_pass_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ldr     r11, [sp, #36]                  ; vp8_filter address (first stack arg above 9 saved regs) |  | ||||||
|     mov     r7, r3, lsl #16                 ; height is top part of counter |  | ||||||
|  |  | ||||||
|     ldr     r4, [r11]                       ; load up packed filter coefficients |  | ||||||
|     add     lr, r1, r3                      ; lr = output_ptr + cnt, used to derive dst for each outer iteration |  | ||||||
|     ldr     r5, [r11, #4] |  | ||||||
|     ldr     r6, [r11, #8] |  | ||||||
|  |  | ||||||
      ; r12 = top half of r4 : bottom half of r5
      ; r11 = top half of r5 : bottom half of r6
|     pkhbt   r12, r5, r4                     ; pack the filter differently |  | ||||||
|     pkhbt   r11, r6, r5 |  | ||||||
|     mov     r4, #0x40                       ; rounding factor (for smlad{x}) |  | ||||||
|  |  | ||||||
| |height_loop_2nd_4| |  | ||||||
|     ldrd    r8, r9, [r0, #-4]               ; load the data |  | ||||||
|     orr     r7, r7, r3, lsr #1              ; inner loop counter (cnt/2) in the low byte of r7 |  | ||||||
|  |  | ||||||
| |width_loop_2nd_4| |  | ||||||
      ; pre-indexed load with writeback: r0 advances by 4 bytes per iteration
|     ldr     r10, [r0, #4]! |  | ||||||
|     smladx  r6, r9, r12, r4                 ; apply filter |  | ||||||
|     pkhbt   r8, r9, r8 |  | ||||||
|     smlad   r5, r8, r12, r4 |  | ||||||
|     pkhbt   r8, r10, r9 |  | ||||||
|     smladx  r6, r10, r11, r6 |  | ||||||
|     sub     r7, r7, #1 |  | ||||||
|     smlad   r5, r8, r11, r5 |  | ||||||
|  |  | ||||||
|     mov     r8, r9                          ; shift the data for the next loop |  | ||||||
|     mov     r9, r10 |  | ||||||
|  |  | ||||||
|     usat    r6, #8, r6, asr #7              ; shift and clamp |  | ||||||
|     usat    r5, #8, r5, asr #7 |  | ||||||
|  |  | ||||||
|     strb    r5, [r1], r2                    ; the result is transposed back and stored |  | ||||||
      ; flags from this tst are consumed by the bne below
|     tst     r7, #0xff |  | ||||||
|     strb    r6, [r1], r2 |  | ||||||
|  |  | ||||||
|     bne     width_loop_2nd_4 |  | ||||||
|  |  | ||||||
      ; decrement the outer counter in the top half of r7; the add/sub that
      ; follow do not touch the flags, so bne still sees this result
|     subs    r7, r7, #0x10000 |  | ||||||
|     add     r0, r0, #16                     ; update src for next loop |  | ||||||
|     sub     r1, lr, r7, lsr #16             ; update dst for next loop: (output_ptr + cnt) - remaining count |  | ||||||
|  |  | ||||||
|     bne     height_loop_2nd_4 |  | ||||||
|  |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ;------------------------------------ |  | ||||||
  ; vp8_filter_block2d_first_pass_only_armv6
  ;
  ; Six-tap filter along each source row, used without a second pass.
  ; For each of cnt rows it reads bytes src[-2..+3] around every position,
  ; applies the six 16-bit coefficients at vp8_filter with 0x40 rounding,
  ; shifts right by 7, saturates to 8 bits and writes cnt consecutive
  ; output bytes (two per inner iteration). Rows advance by
  ; src_pixels_per_line in the source and output_pitch in the destination.
  ;
| ; r0    unsigned char *src_ptr |  | ||||||
| ; r1    unsigned char *output_ptr, |  | ||||||
| ; r2    unsigned int src_pixels_per_line |  | ||||||
| ; r3    unsigned int cnt, |  | ||||||
| ; stack unsigned int output_pitch, |  | ||||||
| ; stack const short *vp8_filter |  | ||||||
| ;------------------------------------ |  | ||||||
| |vp8_filter_block2d_first_pass_only_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     add     r7, r2, r3                      ; preload next row |  | ||||||
|     add     r7, r7, #2 |  | ||||||
|     pld     [r0, r7] |  | ||||||
|  |  | ||||||
      ; nine registers (36 bytes) were pushed: stack args start at [sp, #36]
|     ldr     r4, [sp, #36]                   ; output pitch |  | ||||||
|     ldr     r11, [sp, #40]                  ; HFilter address |  | ||||||
|     sub     sp, sp, #8 |  | ||||||
|  |  | ||||||
|     mov     r7, r3                          ; r7 = row counter (cnt rows) |  | ||||||
|     sub     r2, r2, r3                      ; inside loop increments input array, |  | ||||||
|                                             ; so the height loop only needs to add |  | ||||||
|                                             ; r2 - width to the input pointer |  | ||||||
|  |  | ||||||
|     sub     r4, r4, r3 |  | ||||||
|     str     r4, [sp]                        ; save modified output pitch |  | ||||||
|     str     r2, [sp, #4]                    ; save modified source stride |  | ||||||
|  |  | ||||||
|     mov     r2, #0x40                       ; r2 now holds the rounding constant |  | ||||||
|  |  | ||||||
|     ldr     r4, [r11]                       ; load up packed filter coefficients |  | ||||||
|     ldr     r5, [r11, #4] |  | ||||||
|     ldr     r6, [r11, #8] |  | ||||||
|  |  | ||||||
| ; six tap filter |  | ||||||
| |height_loop_1st_only_6| |  | ||||||
|     ldrb    r8, [r0, #-2]                   ; load data |  | ||||||
|     ldrb    r9, [r0, #-1] |  | ||||||
|     ldrb    r10, [r0], #2 |  | ||||||
|  |  | ||||||
|     mov     r12, r3, lsr #1                 ; loop counter (two outputs per iteration) |  | ||||||
|  |  | ||||||
| |width_loop_1st_only_6| |  | ||||||
|     ldrb    r11, [r0, #-1] |  | ||||||
|  |  | ||||||
|     pkhbt   lr, r8, r9, lsl #16             ; r9 | r8 |  | ||||||
|     pkhbt   r8, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|  |  | ||||||
|     ldrb    r9, [r0] |  | ||||||
|  |  | ||||||
      ; smlad with r2 folds the 0x40 rounding into the first multiply-add,
      ; replacing the smuad + separate add kept below as ';;' lines
| ;;  smuad   lr, lr, r4 |  | ||||||
|     smlad   lr, lr, r4, r2 |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
| ;;  smuad   r8, r8, r4 |  | ||||||
|     smlad   r8, r8, r4, r2 |  | ||||||
|     pkhbt   r11, r11, r9, lsl #16           ; r9 | r11 |  | ||||||
|  |  | ||||||
|     smlad   lr, r10, r5, lr |  | ||||||
|     ldrb    r10, [r0, #1] |  | ||||||
|     smlad   r8, r11, r5, r8 |  | ||||||
|     ldrb    r11, [r0, #2] |  | ||||||
|  |  | ||||||
      ; flags from this subs drive the ldrneb/bne instructions below
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     pkhbt   r9, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|  |  | ||||||
|     smlad   lr, r9, r6, lr |  | ||||||
|     smlad   r10, r10, r6, r8 |  | ||||||
|  |  | ||||||
| ;;  add     lr, lr, #0x40                   ; round_shift_and_clamp |  | ||||||
|     ldrneb  r8, [r0, #-2]                   ; load data for next loop |  | ||||||
|     usat    lr, #8, lr, asr #7 |  | ||||||
| ;;  add     r10, r10, #0x40 |  | ||||||
|     strb    lr, [r1], #1                    ; store the result |  | ||||||
|     usat    r10, #8, r10, asr #7 |  | ||||||
|  |  | ||||||
|     ldrneb  r9, [r0, #-1] |  | ||||||
|     strb    r10, [r1], #1 |  | ||||||
|     ldrneb  r10, [r0], #2 |  | ||||||
|  |  | ||||||
|     bne     width_loop_1st_only_6 |  | ||||||
|  |  | ||||||
|     ldr     lr, [sp]                        ; load back modified output pitch (output_pitch - cnt) |  | ||||||
|     ldr     r12, [sp, #4]                   ; load back modified source stride (src_pixels_per_line - cnt) |  | ||||||
|     subs    r7, r7, #1 |  | ||||||
|     add     r0, r0, r12                     ; update src for next loop |  | ||||||
|  |  | ||||||
|     add     r11, r12, r3                    ; preload next row |  | ||||||
|     add     r11, r11, #2 |  | ||||||
|     pld     [r0, r11] |  | ||||||
|  |  | ||||||
|     add     r1, r1, lr                      ; update dst for next loop |  | ||||||
|  |  | ||||||
|     bne     height_loop_1st_only_6 |  | ||||||
|  |  | ||||||
|     add     sp, sp, #8 |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|     ENDP  ; |vp8_filter_block2d_first_pass_only_armv6| |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ;------------------------------------ |  | ||||||
  ; vp8_filter_block2d_second_pass_only_armv6
  ;
  ; Six-tap filter down each source column, used without a first pass.
  ; Source bytes are read src_pixels_per_line apart, starting two rows
  ; above src_ptr; results (rounded with 0x40, shifted right by 7,
  ; saturated to 8 bits) are stored output_pitch bytes apart.
  ; The outer loop walks cnt columns one byte at a time; the inner loop
  ; produces two rows per iteration until cnt rows are written.
  ;
| ; r0    unsigned char *src_ptr, |  | ||||||
| ; r1    unsigned char *output_ptr, |  | ||||||
| ; r2    unsigned int src_pixels_per_line |  | ||||||
| ; r3    unsigned int cnt, |  | ||||||
| ; stack unsigned int output_pitch, |  | ||||||
| ; stack const short *vp8_filter |  | ||||||
| ;------------------------------------ |  | ||||||
| |vp8_filter_block2d_second_pass_only_armv6| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
      ; nine registers (36 bytes) were pushed: stack args start at [sp, #36]
|     ldr     r11, [sp, #40]                  ; VFilter address |  | ||||||
|     ldr     r12, [sp, #36]                  ; output pitch |  | ||||||
|  |  | ||||||
|     mov     r7, r3, lsl #16                 ; height is top part of counter |  | ||||||
|     sub     r0, r0, r2, lsl #1              ; need 6 elements for filtering, 2 before, 3 after |  | ||||||
|  |  | ||||||
|     sub     sp, sp, #8 |  | ||||||
|  |  | ||||||
|     ldr     r4, [r11]                       ; load up packed filter coefficients |  | ||||||
|     ldr     r5, [r11, #4] |  | ||||||
|     ldr     r6, [r11, #8] |  | ||||||
|  |  | ||||||
|     str     r0, [sp]                        ; save r0 to stack |  | ||||||
|     str     r1, [sp, #4]                    ; save dst to stack |  | ||||||
|  |  | ||||||
| ; six tap filter |  | ||||||
| |width_loop_2nd_only_6| |  | ||||||
|     ldrb    r8, [r0], r2                    ; load data |  | ||||||
|     orr     r7, r7, r3                      ; inner loop counter (cnt, decremented by 2) in the low byte |  | ||||||
|     ldrb    r9, [r0], r2 |  | ||||||
|     ldrb    r10, [r0], r2 |  | ||||||
|  |  | ||||||
| |height_loop_2nd_only_6| |  | ||||||
|     ; filter the first column in this inner loop, then move to the next column. |  | ||||||
|     ldrb    r11, [r0], r2 |  | ||||||
|  |  | ||||||
|     pkhbt   lr, r8, r9, lsl #16             ; r9 | r8 |  | ||||||
|     pkhbt   r8, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|  |  | ||||||
|     ldrb    r9, [r0], r2 |  | ||||||
|  |  | ||||||
|     smuad   lr, lr, r4 |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|     smuad   r8, r8, r4 |  | ||||||
|     pkhbt   r11, r11, r9, lsl #16           ; r9 | r11 |  | ||||||
|  |  | ||||||
|     smlad   lr, r10, r5, lr |  | ||||||
|     ldrb    r10, [r0], r2 |  | ||||||
|     smlad   r8, r11, r5, r8 |  | ||||||
|     ldrb    r11, [r0] |  | ||||||
|  |  | ||||||
|     sub     r7, r7, #2 |  | ||||||
      ; rewind four rows so the next pair of outputs starts two rows
      ; below the pair just computed
|     sub     r0, r0, r2, lsl #2 |  | ||||||
|  |  | ||||||
|     pkhbt   r9, r9, r10, lsl #16            ; r10 | r9 |  | ||||||
|     pkhbt   r10, r10, r11, lsl #16          ; r11 | r10 |  | ||||||
|  |  | ||||||
|     smlad   lr, r9, r6, lr |  | ||||||
|     smlad   r10, r10, r6, r8 |  | ||||||
|  |  | ||||||
      ; test the inner counter; the result drives the ldrneb/bne below
|     ands    r9, r7, #0xff |  | ||||||
|  |  | ||||||
|     add     lr, lr, #0x40                   ; round_shift_and_clamp |  | ||||||
|     ldrneb  r8, [r0], r2                    ; load data for next loop |  | ||||||
|     usat    lr, #8, lr, asr #7 |  | ||||||
|     add     r10, r10, #0x40 |  | ||||||
|     strb    lr, [r1], r12                   ; store the result for the column |  | ||||||
|     usat    r10, #8, r10, asr #7 |  | ||||||
|  |  | ||||||
|     ldrneb  r9, [r0], r2 |  | ||||||
|     strb    r10, [r1], r12 |  | ||||||
|     ldrneb  r10, [r0], r2 |  | ||||||
|  |  | ||||||
|     bne     height_loop_2nd_only_6 |  | ||||||
|  |  | ||||||
      ; reload the saved column pointers, step both one byte to the right
      ; and decrement the outer counter in the top half of r7
|     ldr     r0, [sp] |  | ||||||
|     ldr     r1, [sp, #4] |  | ||||||
|     subs    r7, r7, #0x10000 |  | ||||||
|     add     r0, r0, #1                      ; move to filter next column |  | ||||||
|     str     r0, [sp] |  | ||||||
|     add     r1, r1, #1 |  | ||||||
|     str     r1, [sp, #4] |  | ||||||
|  |  | ||||||
|     bne     width_loop_2nd_only_6 |  | ||||||
|  |  | ||||||
|     add     sp, sp, #8 |  | ||||||
|  |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|     ENDP  ; |vp8_filter_block2d_second_pass_only_armv6| |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,100 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include "vpx_config.h" |  | ||||||
| #include "vp8_rtcd.h" |  | ||||||
|  |  | ||||||
/*
 * Walks a 4x4 grid of 4x4 blocks (the "y block" of the function name) and,
 * per block, dispatches to the ARMv6 dequantize/inverse-transform/add
 * helpers.
 *
 *   q      coefficients, 16 shorts per block; the four blocks of one grid
 *          row are consecutive (64 shorts per grid row).
 *   dq     dequantization factors, forwarded to vp8_dequant_idct_add_v6;
 *          dq[0] scales the first coefficient on the DC-only path.
 *   dst    destination of the top-left block; block (row, col) is at
 *          dst + 4 * row * stride + 4 * col. On the DC-only path it is
 *          passed as both pointer arguments of vp8_dc_only_idct_add_v6.
 *   stride destination pitch in bytes.
 *   eobs   one entry per block, 16 in total. A value > 1 selects
 *          vp8_dequant_idct_add_v6, a value == 1 selects the DC-only path,
 *          any other value leaves the block untouched.
 */
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst,
                                     int stride, char *eobs) {
  int row;

  for (row = 0; row < 4; ++row) {
    int col;

    for (col = 0; col < 4; ++col) {
      short *coeffs = q + 16 * col;
      unsigned char *out = dst + 4 * col;

      if (eobs[col] > 1) {
        vp8_dequant_idct_add_v6(coeffs, dq, out, stride);
      } else if (eobs[col] == 1) {
        vp8_dc_only_idct_add_v6(coeffs[0] * dq[0], out, stride, out, stride);
        /*
         * Clear the first two coefficients. The previous code did this with
         * one store through an (int *) cast, which accesses short storage
         * through an incompatible lvalue type (undefined behavior under the
         * C effective-type rules) and assumes int-aligned coefficients.
         */
        coeffs[0] = 0;
        coeffs[1] = 0;
      }
    }

    q += 64;
    dst += 4 * stride;
    eobs += 4;
  }
}
|  |  | ||||||
/*
 * Same per-block dispatch as the y-block routine, applied to two
 * destination planes in turn: first dstu, then dstv. Each plane is a 2x2
 * grid of 4x4 blocks.
 *
 *   q      coefficients, 16 shorts per block; the four blocks for dstu are
 *          followed directly by the four blocks for dstv.
 *   dq     dequantization factors, forwarded to vp8_dequant_idct_add_v6;
 *          dq[0] scales the first coefficient on the DC-only path.
 *   dstu   destination of the top-left block of the first plane.
 *   dstv   destination of the top-left block of the second plane.
 *   stride pitch in bytes, shared by both planes.
 *   eobs   one entry per block, 8 in total (first plane, then second).
 *          A value > 1 selects vp8_dequant_idct_add_v6, a value == 1
 *          selects the DC-only path, any other value leaves the block
 *          untouched.
 */
void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dstu,
                                      unsigned char *dstv, int stride,
                                      char *eobs) {
  unsigned char *planes[2];
  int plane;

  planes[0] = dstu;
  planes[1] = dstv;

  for (plane = 0; plane < 2; ++plane) {
    unsigned char *dst = planes[plane];
    int row;

    for (row = 0; row < 2; ++row) {
      int col;

      for (col = 0; col < 2; ++col) {
        short *coeffs = q + 16 * col;
        unsigned char *out = dst + 4 * col;

        if (eobs[col] > 1) {
          vp8_dequant_idct_add_v6(coeffs, dq, out, stride);
        } else if (eobs[col] == 1) {
          vp8_dc_only_idct_add_v6(coeffs[0] * dq[0], out, stride, out, stride);
          /*
           * Clear the first two coefficients with plain element stores.
           * The previous (int *) cast wrote short storage through an
           * incompatible lvalue type, which is undefined behavior under
           * the C effective-type rules.
           */
          coeffs[0] = 0;
          coeffs[1] = 0;
        }
      }

      /* q and eobs keep advancing across the plane boundary. */
      q += 32;
      dst += 4 * stride;
      eobs += 2;
    }
  }
}
| @@ -1,202 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_short_idct4x4llm_v6_dual| |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ; void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch, |  | ||||||
| ;                                   unsigned char *dst, int stride) |  | ||||||
  ;
  ; 4x4 inverse transform whose result is added to a prediction block.
  ; loop1_dual runs twice and transforms two columns per iteration,
  ; writing the intermediate values back into the input buffer.
  ; loop2_dual runs twice and transforms two rows per iteration: it adds
  ; the rounding constant 4, shifts right by 3, adds the four pred bytes of
  ; the row, saturates to 8 bits and stores one 32-bit word per row to dst.
  ; pred advances by pitch and dst by stride for every row.
  ;
| ; r0    short* input |  | ||||||
| ; r1    unsigned char* pred |  | ||||||
| ; r2    int pitch |  | ||||||
| ; r3    unsigned char* dst |  | ||||||
| ; sp    int stride |  | ||||||
|  |  | ||||||
| |vp8_short_idct4x4llm_v6_dual| PROC |  | ||||||
|     stmdb   sp!, {r4-r11, lr} |  | ||||||
|  |  | ||||||
      ; one local word, used below to keep the input pointer across loop2_dual
|     sub     sp, sp, #4 |  | ||||||
|  |  | ||||||
|     mov     r4, #0x00008A00         ; sin |  | ||||||
|     orr     r4, r4, #0x0000008C     ; sinpi8sqrt2 |  | ||||||
|  |  | ||||||
|     mov     r5, #0x00004E00         ; cos |  | ||||||
|     orr     r5, r5, #0x0000007B     ; cospi8sqrt2minus1 |  | ||||||
|     orr     r5, r5, #1<<31          ; loop counter on top bit |  | ||||||
|  |  | ||||||
| loop1_dual |  | ||||||
|     ldr     r6, [r0, #(4*2)]        ; i5 | i4 |  | ||||||
|     ldr     r12, [r0, #(12*2)]      ; i13|i12 |  | ||||||
|     ldr     r14, [r0, #(8*2)]       ; i9 | i8 |  | ||||||
|  |  | ||||||
|     smulbt  r9, r5, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16 |  | ||||||
|     smulbb  r7, r5, r6              ; (ip[4] * cospi8sqrt2minus1) >> 16 |  | ||||||
|     smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16 |  | ||||||
|     smulwb  r8, r4, r6              ; (ip[4] * sinpi8sqrt2) >> 16 |  | ||||||
|  |  | ||||||
|     smulbt  r11, r5, r12            ; (ip[13] * cospi8sqrt2minus1) >> 16 |  | ||||||
|     pkhtb   r7, r9, r7, asr #16     ; 5c | 4c |  | ||||||
|     pkhbt   r8, r8, r10, lsl #16    ; 5s | 4s |  | ||||||
|     uadd16  r6, r6, r7              ; 5c+5 | 4c+4 |  | ||||||
|  |  | ||||||
|     smulwt  r7, r4, r12             ; (ip[13] * sinpi8sqrt2) >> 16 |  | ||||||
|     smulbb  r9, r5, r12             ; (ip[12] * cospi8sqrt2minus1) >> 16 |  | ||||||
|     smulwb  r10, r4, r12            ; (ip[12] * sinpi8sqrt2) >> 16 |  | ||||||
|  |  | ||||||
      ; first pass: bit 31 is set, the subtraction clears it without a
      ; borrow (carry set, so bcs loops once more); second pass: it borrows
      ; (carry clear, loop exits) and leaves bit 31 set again for loop2_dual.
      ; No instruction between here and the bcs changes the carry flag.
|     subs    r5, r5, #1<<31          ; i-- |  | ||||||
|  |  | ||||||
|     pkhtb   r9, r11, r9, asr #16    ; 13c | 12c |  | ||||||
|     ldr     r11, [r0]               ; i1 | i0 |  | ||||||
|     pkhbt   r10, r10, r7, lsl #16   ; 13s | 12s |  | ||||||
|     uadd16  r7, r12, r9             ; 13c+13 | 12c+12 |  | ||||||
|  |  | ||||||
|     usub16  r7, r8, r7              ; c |  | ||||||
|     uadd16  r6, r6, r10             ; d |  | ||||||
|     uadd16  r10, r11, r14           ; a |  | ||||||
|     usub16  r8, r11, r14            ; b |  | ||||||
|  |  | ||||||
|     uadd16  r9, r10, r6             ; a+d |  | ||||||
|     usub16  r10, r10, r6            ; a-d |  | ||||||
|     uadd16  r6, r8, r7              ; b+c |  | ||||||
|     usub16  r7, r8, r7              ; b-c |  | ||||||
|  |  | ||||||
|     ; use input buffer to store intermediate results |  | ||||||
|     str      r6, [r0, #(4*2)]       ; o5 | o4 |  | ||||||
|     str      r7, [r0, #(8*2)]       ; o9 | o8 |  | ||||||
|     str      r10,[r0, #(12*2)]      ; o13|o12 |  | ||||||
|     str      r9, [r0], #4           ; o1 | o0 (post-increment to the next two columns) |  | ||||||
|  |  | ||||||
|     bcs loop1_dual |  | ||||||
|  |  | ||||||
|     sub     r0, r0, #8              ; reset input/output |  | ||||||
|     str     r0, [sp]                ; keep the input pointer; r0 is reused as scratch below |  | ||||||
|  |  | ||||||
| loop2_dual |  | ||||||
|  |  | ||||||
|     ldr     r6, [r0, #(4*2)]        ; i5 | i4 |  | ||||||
|     ldr     r12,[r0, #(2*2)]        ; i3 | i2 |  | ||||||
|     ldr     r14,[r0, #(6*2)]        ; i7 | i6 |  | ||||||
|     ldr     r0, [r0, #(0*2)]        ; i1 | i0 |  | ||||||
|  |  | ||||||
|     smulbt  r9, r5, r6              ; (ip[5] * cospi8sqrt2minus1) >> 16 |  | ||||||
|     smulbt  r7, r5, r0              ; (ip[1] * cospi8sqrt2minus1) >> 16 |  | ||||||
|     smulwt  r10, r4, r6             ; (ip[5] * sinpi8sqrt2) >> 16 |  | ||||||
|     smulwt  r8, r4, r0              ; (ip[1] * sinpi8sqrt2) >> 16 |  | ||||||
|  |  | ||||||
|     pkhbt   r11, r6, r0, lsl #16    ; i0 | i4 |  | ||||||
|     pkhtb   r7, r7, r9, asr #16     ; 1c | 5c |  | ||||||
|     pkhtb   r0, r0, r6, asr #16     ; i1 | i5 |  | ||||||
|     pkhbt   r8, r10, r8, lsl #16    ; 1s | 5s = temp1 |  | ||||||
|  |  | ||||||
|     uadd16  r0, r7, r0              ; 1c+1 | 5c+5 = temp2 |  | ||||||
|     pkhbt   r9, r14, r12, lsl #16   ; i2 | i6 |  | ||||||
|     uadd16  r10, r11, r9            ; a |  | ||||||
|     usub16  r9, r11, r9             ; b |  | ||||||
|     pkhtb   r6, r12, r14, asr #16   ; i3 | i7 |  | ||||||
|  |  | ||||||
      ; same top-bit counter trick as in loop1_dual; the carry produced here
      ; is consumed by the bcs at the end of the loop
|     subs    r5, r5, #1<<31          ; i-- |  | ||||||
|  |  | ||||||
|     smulbt  r7, r5, r6              ; (ip[3] * cospi8sqrt2minus1) >> 16 |  | ||||||
|     smulwt  r11, r4, r6             ; (ip[3] * sinpi8sqrt2) >> 16 |  | ||||||
|     smulbb  r12, r5, r6             ; (ip[7] * cospi8sqrt2minus1) >> 16 |  | ||||||
|     smulwb  r14, r4, r6             ; (ip[7] * sinpi8sqrt2) >> 16 |  | ||||||
|  |  | ||||||
|     pkhtb   r7, r7, r12, asr #16    ; 3c | 7c |  | ||||||
|     pkhbt   r11, r14, r11, lsl #16  ; 3s | 7s = temp1 |  | ||||||
|  |  | ||||||
|     uadd16  r6, r7, r6              ; 3c+3 | 7c+7 = temp2 |  | ||||||
|     usub16  r12, r8, r6             ; c (o1 | o5) |  | ||||||
|     uadd16  r6, r11, r0             ; d (o3 | o7) |  | ||||||
|     uadd16  r7, r10, r6             ; a+d |  | ||||||
|  |  | ||||||
|     mov     r8, #4                  ; set up 4's |  | ||||||
|     orr     r8, r8, #0x40000        ; 4|4 |  | ||||||
|  |  | ||||||
|     usub16  r6, r10, r6             ; a-d |  | ||||||
|     uadd16  r6, r6, r8              ; a-d+4, 3|7 |  | ||||||
|     uadd16  r7, r7, r8              ; a+d+4, 0|4 |  | ||||||
|     uadd16  r10, r9, r12            ; b+c |  | ||||||
|     usub16  r0, r9, r12             ; b-c |  | ||||||
|     uadd16  r10, r10, r8            ; b+c+4, 1|5 |  | ||||||
|     uadd16  r8, r0, r8              ; b-c+4, 2|6 |  | ||||||
|  |  | ||||||
|     ldr     lr, [sp, #40]           ; dst stride (36 bytes of saved registers + 4-byte local) |  | ||||||
|  |  | ||||||
|     ldrb    r0, [r1]                ; pred p0 |  | ||||||
|     ldrb    r11, [r1, #1]           ; pred p1 |  | ||||||
|     ldrb    r12, [r1, #2]           ; pred p2 |  | ||||||
|  |  | ||||||
      ; asr #19 = 16 (select the top halfword, first row) + 3 (final >> 3)
|     add     r0, r0, r7, asr #19     ; p0 + o0 |  | ||||||
|     add     r11, r11, r10, asr #19  ; p1 + o1 |  | ||||||
|     add     r12, r12, r8, asr #19   ; p2 + o2 |  | ||||||
|  |  | ||||||
|     usat    r0, #8, r0              ; d0 = clip8(p0 + o0) |  | ||||||
|     usat    r11, #8, r11            ; d1 = clip8(p1 + o1) |  | ||||||
|     usat    r12, #8, r12            ; d2 = clip8(p2 + o2) |  | ||||||
|  |  | ||||||
|     add     r0, r0, r11, lsl #8     ; |--|--|d1|d0| |  | ||||||
|  |  | ||||||
|     ldrb    r11, [r1, #3]           ; pred p3 |  | ||||||
|  |  | ||||||
|     add     r0, r0, r12, lsl #16    ; |--|d2|d1|d0| |  | ||||||
|  |  | ||||||
|     add     r11, r11, r6, asr #19   ; p3 + o3 |  | ||||||
|  |  | ||||||
|     sxth    r7, r7                  ; sign-extend the bottom halfword (second row) |  | ||||||
|     sxth    r10, r10                ; |  | ||||||
|  |  | ||||||
|     usat    r11, #8, r11            ; d3 = clip8(p3 + o3) |  | ||||||
|  |  | ||||||
|     sxth    r8, r8                  ; |  | ||||||
|     sxth    r6, r6                  ; |  | ||||||
|  |  | ||||||
|     add     r0, r0, r11, lsl #24    ; |d3|d2|d1|d0| |  | ||||||
|  |  | ||||||
|     ldrb    r12, [r1, r2]!          ; pred p4 (writeback: pred += pitch) |  | ||||||
|     str     r0, [r3], lr |  | ||||||
|     ldrb    r11, [r1, #1]           ; pred p5 |  | ||||||
|  |  | ||||||
|     add     r12, r12, r7, asr #3    ; p4 + o4 |  | ||||||
|     add     r11, r11, r10, asr #3   ; p5 + o5 |  | ||||||
|  |  | ||||||
|     usat    r12, #8, r12            ; d4 = clip8(p4 + o4) |  | ||||||
|     usat    r11, #8, r11            ; d5 = clip8(p5 + o5) |  | ||||||
|  |  | ||||||
|     ldrb    r7, [r1, #2]            ; pred p6 |  | ||||||
|     ldrb    r10, [r1, #3]           ; pred p7 |  | ||||||
|  |  | ||||||
|     add     r12, r12, r11, lsl #8   ; |--|--|d5|d4| |  | ||||||
|  |  | ||||||
|     add     r7, r7, r8, asr #3      ; p6 + o6 |  | ||||||
|     add     r10, r10, r6, asr #3    ; p7 + o7 |  | ||||||
|  |  | ||||||
|     ldr     r0, [sp]                ; load input pointer |  | ||||||
|  |  | ||||||
|     usat    r7, #8, r7              ; d6 = clip8(p6 + o6) |  | ||||||
|     usat    r10, #8, r10            ; d7 = clip8(p7 + o7) |  | ||||||
|  |  | ||||||
|     add     r12, r12, r7, lsl #16   ; |--|d6|d5|d4| |  | ||||||
|     add     r12, r12, r10, lsl #24  ; |d7|d6|d5|d4| |  | ||||||
|  |  | ||||||
|     str     r12, [r3], lr |  | ||||||
      ; the saved pointer at [sp] is not updated; with only two iterations,
      ; saved pointer + 16 bytes is exactly the second pair of rows
|     add     r0, r0, #16 |  | ||||||
|     add     r1, r1, r2              ; pred + pitch |  | ||||||
|  |  | ||||||
|     bcs loop2_dual |  | ||||||
|  |  | ||||||
|     add     sp, sp, #4              ; release the local word that held the input pointer |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,136 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|     EXPORT |vp8_short_inv_walsh4x4_v6| |  | ||||||
|  |  | ||||||
|     ARM |  | ||||||
|     REQUIRE8 |  | ||||||
|     PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY  ; name this block of code |  | ||||||
|  |  | ||||||
| ;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff) |  | ||||||
| |vp8_short_inv_walsh4x4_v6| PROC |  | ||||||
|  |  | ||||||
|     stmdb       sp!, {r4 - r12, lr} |  | ||||||
|  |  | ||||||
|     ldr         r2, [r0, #0]         ; [1  |  0] |  | ||||||
|     ldr         r3, [r0, #4]         ; [3  |  2] |  | ||||||
|     ldr         r4, [r0, #8]         ; [5  |  4] |  | ||||||
|     ldr         r5, [r0, #12]        ; [7  |  6] |  | ||||||
|     ldr         r6, [r0, #16]        ; [9  |  8] |  | ||||||
|     ldr         r7, [r0, #20]        ; [11 | 10] |  | ||||||
|     ldr         r8, [r0, #24]        ; [13 | 12] |  | ||||||
|     ldr         r9, [r0, #28]        ; [15 | 14] |  | ||||||
|  |  | ||||||
|     qadd16      r10, r2, r8          ; a1 [1+13  |  0+12] |  | ||||||
|     qadd16      r11, r4, r6          ; b1 [5+9   |  4+8] |  | ||||||
|     qsub16      r12, r4, r6          ; c1 [5-9   |  4-8] |  | ||||||
|     qsub16      lr, r2, r8           ; d1 [1-13  |  0-12] |  | ||||||
|  |  | ||||||
|     qadd16      r2, r10, r11         ; a1 + b1 [1  |  0] |  | ||||||
|     qadd16      r4, r12, lr          ; c1 + d1 [5  |  4] |  | ||||||
|     qsub16      r6, r10, r11         ; a1 - b1 [9  |  8] |  | ||||||
|     qsub16      r8, lr, r12          ; d1 - c1 [13 | 12] |  | ||||||
|  |  | ||||||
|     qadd16      r10, r3, r9          ; a1 [3+15  |  2+14] |  | ||||||
|     qadd16      r11, r5, r7          ; b1 [7+11  |  6+10] |  | ||||||
|     qsub16      r12, r5, r7          ; c1 [7-11  |  6-10] |  | ||||||
|     qsub16      lr, r3, r9           ; d1 [3-15  |  2-14] |  | ||||||
|  |  | ||||||
|     qadd16      r3, r10, r11         ; a1 + b1 [3  |  2] |  | ||||||
|     qadd16      r5, r12, lr          ; c1 + d1 [7  |  6] |  | ||||||
|     qsub16      r7, r10, r11         ; a1 - b1 [11 | 10] |  | ||||||
|     qsub16      r9, lr, r12          ; d1 - c1 [15 | 14] |  | ||||||
|  |  | ||||||
|     ; first transform complete |  | ||||||
|  |  | ||||||
|     qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3] |  | ||||||
|     qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3] |  | ||||||
|     qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7] |  | ||||||
|     qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7] |  | ||||||
|  |  | ||||||
|     qaddsubx    r2, r10, r11         ; [b2|c2] [c1+d1 | a1-b1] |  | ||||||
|     qaddsubx    r3, r11, r10         ; [a2|d2] [b1+a1 | d1-c1] |  | ||||||
|     ldr         r10, c0x00030003 |  | ||||||
|     qaddsubx    r4, r12, lr          ; [b2|c2] [c1+d1 | a1-b1] |  | ||||||
|     qaddsubx    r5, lr, r12          ; [a2|d2] [b1+a1 | d1-c1] |  | ||||||
|  |  | ||||||
|     qadd16      r2, r2, r10          ; [b2+3|c2+3] |  | ||||||
|     qadd16      r3, r3, r10          ; [a2+3|d2+3] |  | ||||||
|     qadd16      r4, r4, r10          ; [b2+3|c2+3] |  | ||||||
|     qadd16      r5, r5, r10          ; [a2+3|d2+3] |  | ||||||
|  |  | ||||||
|     asr         r12, r3, #19         ; [0] |  | ||||||
|     strh        r12, [r1], #32 |  | ||||||
|     asr         lr, r2, #19          ; [1] |  | ||||||
|     strh        lr, [r1], #32 |  | ||||||
|     sxth        r2, r2 |  | ||||||
|     sxth        r3, r3 |  | ||||||
|     asr         r2, r2, #3           ; [2] |  | ||||||
|     strh        r2, [r1], #32 |  | ||||||
|     asr         r3, r3, #3           ; [3] |  | ||||||
|     strh        r3, [r1], #32 |  | ||||||
|  |  | ||||||
|     asr         r12, r5, #19         ; [4] |  | ||||||
|     strh        r12, [r1], #32 |  | ||||||
|     asr         lr, r4, #19          ; [5] |  | ||||||
|     strh        lr, [r1], #32 |  | ||||||
|     sxth        r4, r4 |  | ||||||
|     sxth        r5, r5 |  | ||||||
|     asr         r4, r4, #3           ; [6] |  | ||||||
|     strh        r4, [r1], #32 |  | ||||||
|     asr         r5, r5, #3           ; [7] |  | ||||||
|     strh        r5, [r1], #32 |  | ||||||
|  |  | ||||||
|     qsubaddx    r2, r6, r7           ; [c1|a1] [9-10  |  8+11] |  | ||||||
|     qaddsubx    r3, r6, r7           ; [b1|d1] [9+10  |  8-11] |  | ||||||
|     qsubaddx    r4, r8, r9           ; [c1|a1] [13-14 | 12+15] |  | ||||||
|     qaddsubx    r5, r8, r9           ; [b1|d1] [13+14 | 12-15] |  | ||||||
|  |  | ||||||
|     qaddsubx    r6, r2, r3           ; [b2|c2] [c1+d1 | a1-b1] |  | ||||||
|     qaddsubx    r7, r3, r2           ; [a2|d2] [b1+a1 | d1-c1] |  | ||||||
|     qaddsubx    r8, r4, r5           ; [b2|c2] [c1+d1 | a1-b1] |  | ||||||
|     qaddsubx    r9, r5, r4           ; [a2|d2] [b1+a1 | d1-c1] |  | ||||||
|  |  | ||||||
|     qadd16      r6, r6, r10          ; [b2+3|c2+3] |  | ||||||
|     qadd16      r7, r7, r10          ; [a2+3|d2+3] |  | ||||||
|     qadd16      r8, r8, r10          ; [b2+3|c2+3] |  | ||||||
|     qadd16      r9, r9, r10          ; [a2+3|d2+3] |  | ||||||
|  |  | ||||||
|     asr         r12, r7, #19         ; [8] |  | ||||||
|     strh        r12, [r1], #32 |  | ||||||
|     asr         lr, r6, #19          ; [9] |  | ||||||
|     strh        lr, [r1], #32 |  | ||||||
|     sxth        r6, r6 |  | ||||||
|     sxth        r7, r7 |  | ||||||
|     asr         r6, r6, #3           ; [10] |  | ||||||
|     strh        r6, [r1], #32 |  | ||||||
|     asr         r7, r7, #3           ; [11] |  | ||||||
|     strh        r7, [r1], #32 |  | ||||||
|  |  | ||||||
|     asr         r12, r9, #19         ; [12] |  | ||||||
|     strh        r12, [r1], #32 |  | ||||||
|     asr         lr, r8, #19          ; [13] |  | ||||||
|     strh        lr, [r1], #32 |  | ||||||
|     sxth        r8, r8 |  | ||||||
|     sxth        r9, r9 |  | ||||||
|     asr         r8, r8, #3           ; [14] |  | ||||||
|     strh        r8, [r1], #32 |  | ||||||
|     asr         r9, r9, #3           ; [15] |  | ||||||
|     strh        r9, [r1], #32 |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r12, pc} |  | ||||||
|     ENDP        ; |vp8_short_inv_walsh4x4_v6| |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ; Constant Pool |  | ||||||
| c0x00030003 DCD 0x00030003 |  | ||||||
|     END |  | ||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,286 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6| |  | ||||||
|     EXPORT |vp8_loop_filter_simple_vertical_edge_armv6| |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY  ; name this block of code |  | ||||||
|  |  | ||||||
|     MACRO |  | ||||||
|     TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 |  | ||||||
|     ; Transposes a 4x4 matrix of bytes held one row per 32-bit register. |  | ||||||
|     ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 |  | ||||||
|     ; $a0-$a3 are overwritten as scratch and no longer hold the input on exit. |  | ||||||
|     ; Byte layout (most significant byte on the left): |  | ||||||
|     ; a0: 03 02 01 00 |  | ||||||
|     ; a1: 13 12 11 10 |  | ||||||
|     ; a2: 23 22 21 20 |  | ||||||
|     ; a3: 33 32 31 30 |  | ||||||
|     ;     b3 b2 b1 b0 |  | ||||||
|     ; NOTE(review): the "-- pN" tags on the pkh* lines do not match the names |  | ||||||
|     ; used by the caller in this file, which treats b0, b1, b2, b3 as |  | ||||||
|     ; p1, p0, q0, q1 -- presumably the tags were inherited from another filter. |  | ||||||
|  |  | ||||||
|     ; even-numbered bytes of every row, interleaved pairwise |  | ||||||
|     uxtb16      $b1, $a1                    ; xx 12 xx 10 |  | ||||||
|     uxtb16      $b0, $a0                    ; xx 02 xx 00 |  | ||||||
|     uxtb16      $b3, $a3                    ; xx 32 xx 30 |  | ||||||
|     uxtb16      $b2, $a2                    ; xx 22 xx 20 |  | ||||||
|     orr         $b1, $b0, $b1, lsl #8       ; 12 02 10 00 |  | ||||||
|     orr         $b3, $b2, $b3, lsl #8       ; 32 22 30 20 |  | ||||||
|  |  | ||||||
|     ; odd-numbered bytes of every row, interleaved pairwise (clobbers inputs) |  | ||||||
|     uxtb16      $a1, $a1, ror #8            ; xx 13 xx 11 |  | ||||||
|     uxtb16      $a3, $a3, ror #8            ; xx 33 xx 31 |  | ||||||
|     uxtb16      $a0, $a0, ror #8            ; xx 03 xx 01 |  | ||||||
|     uxtb16      $a2, $a2, ror #8            ; xx 23 xx 21 |  | ||||||
|     orr         $a0, $a0, $a1, lsl #8       ; 13 03 11 01 |  | ||||||
|     orr         $a2, $a2, $a3, lsl #8       ; 33 23 31 21 |  | ||||||
|  |  | ||||||
|     ; combine halfwords into the final columns |  | ||||||
|     pkhtb       $b2, $b3, $b1, asr #16      ; 32 22 12 02   -- p1 |  | ||||||
|     pkhbt       $b0, $b1, $b3, lsl #16      ; 30 20 10 00   -- p3 |  | ||||||
|  |  | ||||||
|     pkhtb       $b3, $a2, $a0, asr #16      ; 33 23 13 03   -- p0 |  | ||||||
|     pkhbt       $b1, $a0, $a2, lsl #16      ; 31 21 11 01   -- p2 |  | ||||||
|     MEND |  | ||||||
|  |  | ||||||
|  |  | ||||||
|  |  | ||||||
| src         RN  r0 |  | ||||||
| pstep       RN  r1 |  | ||||||
|  |  | ||||||
| ;r0     unsigned char *src_ptr, |  | ||||||
| ;r1     int src_pixel_step, |  | ||||||
| ;r2     const char *blimit |  | ||||||
|  |  | ||||||
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |  | ||||||
| |vp8_loop_filter_simple_horizontal_edge_armv6| PROC |  | ||||||
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |  | ||||||
| ; Simple loop filter across a horizontal edge. |  | ||||||
| ; Each pass handles 4 adjacent pixels: one 32-bit word from each of the rows |  | ||||||
| ; p1 (src - 2*pstep), p0 (src - pstep), q0 (src) and q1 (src + pstep). |  | ||||||
| ; Four passes are made, with src advancing by 4 bytes between passes. |  | ||||||
| ; Only the p0 and q0 rows are written back. |  | ||||||
|     stmdb       sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ldrb        r12, [r2]                   ; blimit |  | ||||||
|     ldr         r3, [src, -pstep, lsl #1]   ; p1 |  | ||||||
|     ldr         r4, [src, -pstep]           ; p0 |  | ||||||
|     ldr         r5, [src]                   ; q0 |  | ||||||
|     ldr         r6, [src, pstep]            ; q1 |  | ||||||
|     orr         r12, r12, r12, lsl #8       ; blimit replicated into 2 bytes |  | ||||||
|     ldr         r2, c0x80808080             ; r2 is reused: sign-bias constant |  | ||||||
|     orr         r12, r12, r12, lsl #16      ; blimit replicated into all 4 bytes |  | ||||||
|     mov         r9, #4                      ; loop count: 4 passes of 4 pixels each |  | ||||||
|     mov         lr, #0                      ; need 0 in a couple places |  | ||||||
|  |  | ||||||
| |simple_hnext8| |  | ||||||
|     ; vp8_simple_filter_mask() |  | ||||||
|  |  | ||||||
|     uqsub8      r7, r3, r6                  ; p1 - q1 |  | ||||||
|     uqsub8      r8, r6, r3                  ; q1 - p1 |  | ||||||
|     uqsub8      r10, r4, r5                 ; p0 - q0 |  | ||||||
|     uqsub8      r11, r5, r4                 ; q0 - p0 |  | ||||||
|     orr         r8, r8, r7                  ; abs(p1 - q1) |  | ||||||
|     orr         r10, r10, r11               ; abs(p0 - q0) |  | ||||||
|     uqadd8      r10, r10, r10               ; abs(p0 - q0) * 2 |  | ||||||
|     uhadd8      r8, r8, lr                  ; abs(p1 - q1) >> 1 |  | ||||||
|     uqadd8      r10, r10, r8                ; abs(p0 - q0)*2 + abs(p1 - q1)/2 |  | ||||||
|     mvn         r8, #0 |  | ||||||
|     usub8       r10, r12, r10               ; compare to blimit. usub8 sets the per-byte GE flags |  | ||||||
|     sel         r10, r8, lr                 ; filter mask: 0xFF or 0x00 per byte |  | ||||||
|     cmp         r10, #0 |  | ||||||
|     beq         simple_hskip_filter         ; skip filtering if all masks are 0x00 |  | ||||||
|  |  | ||||||
|     ;vp8_simple_filter() |  | ||||||
|  |  | ||||||
|     eor         r3, r3, r2                  ; p1 offset to convert to a signed value |  | ||||||
|     eor         r6, r6, r2                  ; q1 offset to convert to a signed value |  | ||||||
|     eor         r4, r4, r2                  ; p0 offset to convert to a signed value |  | ||||||
|     eor         r5, r5, r2                  ; q0 offset to convert to a signed value |  | ||||||
|  |  | ||||||
|     qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1 |  | ||||||
|     qsub8       r6, r5, r4                  ; q0 - p0 |  | ||||||
|     qadd8       r3, r3, r6                  ; += q0 - p0 |  | ||||||
|     ldr         r7, c0x04040404 |  | ||||||
|     qadd8       r3, r3, r6                  ; += q0 - p0 |  | ||||||
|     ldr         r8, c0x03030303 |  | ||||||
|     qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0)) |  | ||||||
|     ;STALL |  | ||||||
|     and         r3, r3, r10                 ; vp8_filter &= mask |  | ||||||
|  |  | ||||||
|     qadd8       r7 , r3 , r7                ; Filter1 = vp8_filter + 4 |  | ||||||
|     qadd8       r8 , r3 , r8                ; Filter2 = vp8_filter + 3 |  | ||||||
|  |  | ||||||
|     ; shadd8 with zero is a signed per-byte shift right by 1; three of them give >> 3 |  | ||||||
|     shadd8      r7 , r7 , lr |  | ||||||
|     shadd8      r8 , r8 , lr |  | ||||||
|     shadd8      r7 , r7 , lr |  | ||||||
|     shadd8      r8 , r8 , lr |  | ||||||
|     shadd8      r7 , r7 , lr                ; Filter1 >>= 3 |  | ||||||
|     shadd8      r8 , r8 , lr                ; Filter2 >>= 3 |  | ||||||
|  |  | ||||||
|     qsub8       r5 ,r5, r7                  ; u = q0 - Filter1 |  | ||||||
|     qadd8       r4, r4, r8                  ; u = p0 + Filter2 |  | ||||||
|     eor         r5, r5, r2                  ; *oq0 = u^0x80 |  | ||||||
|     str         r5, [src]                   ; store oq0 result |  | ||||||
|     eor         r4, r4, r2                  ; *op0 = u^0x80 |  | ||||||
|     str         r4, [src, -pstep]           ; store op0 result |  | ||||||
|  |  | ||||||
| |simple_hskip_filter| |  | ||||||
|     subs        r9, r9, #1 |  | ||||||
|     addne       src, src, #4                ; advance to the next 4 pixels along the edge |  | ||||||
|  |  | ||||||
|     ; reload the four rows for the next pass (skipped after the last pass) |  | ||||||
|     ldrne       r3, [src, -pstep, lsl #1]   ; p1 |  | ||||||
|     ldrne       r4, [src, -pstep]           ; p0 |  | ||||||
|     ldrne       r5, [src]                   ; q0 |  | ||||||
|     ldrne       r6, [src, pstep]            ; q1 |  | ||||||
|  |  | ||||||
|     bne         simple_hnext8 |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r11, pc} |  | ||||||
|     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_armv6| |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |  | ||||||
| |vp8_loop_filter_simple_vertical_edge_armv6| PROC |  | ||||||
| ;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- |  | ||||||
| ; Simple loop filter across a vertical edge. |  | ||||||
| ; Each pass reads 4 rows of 4 bytes ([src-2 .. src+1]), transposes them so |  | ||||||
| ; that p1, p0, q0, q1 each occupy one register, filters, and writes back the |  | ||||||
| ; p0 (src-1) and q0 (src) bytes of each row. Four passes are made; src |  | ||||||
| ; advances by pstep after every row. |  | ||||||
|     stmdb       sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ldrb        r12, [r2]                   ; r12: blimit |  | ||||||
|     ldr         r2, c0x80808080             ; r2 is reused: sign-bias constant |  | ||||||
|     orr         r12, r12, r12, lsl #8 |  | ||||||
|  |  | ||||||
|     ; load source data: each row is read as two halfwords (into r3-r6) and |  | ||||||
|     ; packed into one word; rows 0-1 go to r7, r8 here, rows 2-3 are packed |  | ||||||
|     ; into r9, r10 at the top of the loop |  | ||||||
|     ldrh        r3, [src, #-2] |  | ||||||
|     pld         [src, #23]                  ; preload for next block |  | ||||||
|     ldrh        r4, [src], pstep |  | ||||||
|     orr         r12, r12, r12, lsl #16      ; blimit replicated into all 4 bytes |  | ||||||
|  |  | ||||||
|     ldrh        r5, [src, #-2] |  | ||||||
|     pld         [src, #23] |  | ||||||
|     ldrh        r6, [src], pstep |  | ||||||
|  |  | ||||||
|     pkhbt       r7, r3, r4, lsl #16         ; row 0 |  | ||||||
|  |  | ||||||
|     ldrh        r3, [src, #-2] |  | ||||||
|     pld         [src, #23] |  | ||||||
|     ldrh        r4, [src], pstep |  | ||||||
|  |  | ||||||
|     pkhbt       r8, r5, r6, lsl #16         ; row 1 |  | ||||||
|  |  | ||||||
|     ldrh        r5, [src, #-2] |  | ||||||
|     pld         [src, #23] |  | ||||||
|     ldrh        r6, [src], pstep |  | ||||||
|     mov         r11, #4                     ; loop count: 4 passes of 4 rows each |  | ||||||
|  |  | ||||||
| |simple_vnext8| |  | ||||||
|     ; vp8_simple_filter_mask() function |  | ||||||
|     pkhbt       r9, r3, r4, lsl #16         ; row 2 |  | ||||||
|     pkhbt       r10, r5, r6, lsl #16        ; row 3 |  | ||||||
|  |  | ||||||
|     ;transpose r7, r8, r9, r10 to r3, r4, r5, r6 |  | ||||||
|     ;afterwards r3 = p1, r4 = p0, r5 = q0, r6 = q1 (one byte per row) |  | ||||||
|     TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6 |  | ||||||
|  |  | ||||||
|     uqsub8      r7, r3, r6                  ; p1 - q1 |  | ||||||
|     uqsub8      r8, r6, r3                  ; q1 - p1 |  | ||||||
|     uqsub8      r9, r4, r5                  ; p0 - q0 |  | ||||||
|     uqsub8      r10, r5, r4                 ; q0 - p0 |  | ||||||
|     orr         r7, r7, r8                  ; abs(p1 - q1) |  | ||||||
|     orr         r9, r9, r10                 ; abs(p0 - q0) |  | ||||||
|     mov         r8, #0 |  | ||||||
|     uqadd8      r9, r9, r9                  ; abs(p0 - q0) * 2 |  | ||||||
|     uhadd8      r7, r7, r8                  ; abs(p1 - q1) / 2 |  | ||||||
|     uqadd8      r7, r7, r9                  ; abs(p0 - q0)*2 + abs(p1 - q1)/2 |  | ||||||
|     mvn         r10, #0                     ; r10 == -1 |  | ||||||
|  |  | ||||||
|     usub8       r7, r12, r7                 ; compare to blimit; sets the per-byte GE flags |  | ||||||
|     sel         lr, r10, r8                 ; filter mask: 0xFF or 0x00 per byte |  | ||||||
|  |  | ||||||
|     cmp         lr, #0 |  | ||||||
|     beq         simple_vskip_filter         ; skip filtering |  | ||||||
|  |  | ||||||
|     ;vp8_simple_filter() function |  | ||||||
|     eor         r3, r3, r2                  ; p1 offset to convert to a signed value |  | ||||||
|     eor         r6, r6, r2                  ; q1 offset to convert to a signed value |  | ||||||
|     eor         r4, r4, r2                  ; p0 offset to convert to a signed value |  | ||||||
|     eor         r5, r5, r2                  ; q0 offset to convert to a signed value |  | ||||||
|  |  | ||||||
|     qsub8       r3, r3, r6                  ; vp8_filter = p1 - q1 |  | ||||||
|     qsub8       r6, r5, r4                  ; q0 - p0 |  | ||||||
|  |  | ||||||
|     qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0 |  | ||||||
|     ldr         r9, c0x03030303             ; r9 = 3 |  | ||||||
|  |  | ||||||
|     qadd8       r3, r3, r6                  ; vp8_filter += q0 - p0 |  | ||||||
|     ldr         r7, c0x04040404 |  | ||||||
|  |  | ||||||
|     qadd8       r3, r3, r6                  ; vp8_filter = p1-q1 + 3*(q0-p0)) |  | ||||||
|     ;STALL |  | ||||||
|     and         r3, r3, lr                  ; vp8_filter &= mask |  | ||||||
|  |  | ||||||
|     qadd8       r9 , r3 , r9                ; Filter2 = vp8_filter + 3 |  | ||||||
|     qadd8       r3 , r3 , r7                ; Filter1 = vp8_filter + 4 |  | ||||||
|  |  | ||||||
|     ; shadd8 with zero (r8) is a signed per-byte shift right by 1; three give >> 3 |  | ||||||
|     shadd8      r9 , r9 , r8 |  | ||||||
|     shadd8      r3 , r3 , r8 |  | ||||||
|     shadd8      r9 , r9 , r8 |  | ||||||
|     shadd8      r3 , r3 , r8 |  | ||||||
|     shadd8      r9 , r9 , r8                ; Filter2 >>= 3 |  | ||||||
|     shadd8      r3 , r3 , r8                ; Filter1 >>= 3 |  | ||||||
|  |  | ||||||
|     ;calculate output |  | ||||||
|     sub         src, src, pstep, lsl #2     ; rewind src to the first of the 4 rows just loaded |  | ||||||
|  |  | ||||||
|     qadd8       r4, r4, r9                  ; u = p0 + Filter2 |  | ||||||
|     qsub8       r5, r5, r3                  ; u = q0 - Filter1 |  | ||||||
|     eor         r4, r4, r2                  ; *op0 = u^0x80 |  | ||||||
|     eor         r5, r5, r2                  ; *oq0 = u^0x80 |  | ||||||
|  |  | ||||||
|     ; one byte of r4 (p0) and r5 (q0) per row; the stores re-advance src by 4 rows |  | ||||||
|     strb        r4, [src, #-1]              ; store the result |  | ||||||
|     mov         r4, r4, lsr #8 |  | ||||||
|     strb        r5, [src], pstep |  | ||||||
|     mov         r5, r5, lsr #8 |  | ||||||
|  |  | ||||||
|     strb        r4, [src, #-1] |  | ||||||
|     mov         r4, r4, lsr #8 |  | ||||||
|     strb        r5, [src], pstep |  | ||||||
|     mov         r5, r5, lsr #8 |  | ||||||
|  |  | ||||||
|     strb        r4, [src, #-1] |  | ||||||
|     mov         r4, r4, lsr #8 |  | ||||||
|     strb        r5, [src], pstep |  | ||||||
|     mov         r5, r5, lsr #8 |  | ||||||
|  |  | ||||||
|     strb        r4, [src, #-1] |  | ||||||
|     strb        r5, [src], pstep |  | ||||||
|  |  | ||||||
| |simple_vskip_filter| |  | ||||||
|     subs        r11, r11, #1 |  | ||||||
|  |  | ||||||
|     ; load source data for the next 4 rows (same scheme as before the loop). |  | ||||||
|     ; The loads are conditional; the pld/pkhbt instructions run unconditionally |  | ||||||
|     ; and their results are simply unused after the final pass. |  | ||||||
|     ldrneh      r3, [src, #-2] |  | ||||||
|     pld         [src, #23]                  ; preload for next block |  | ||||||
|     ldrneh      r4, [src], pstep |  | ||||||
|  |  | ||||||
|     ldrneh      r5, [src, #-2] |  | ||||||
|     pld         [src, #23] |  | ||||||
|     ldrneh      r6, [src], pstep |  | ||||||
|  |  | ||||||
|     pkhbt       r7, r3, r4, lsl #16         ; row 0 |  | ||||||
|  |  | ||||||
|     ldrneh      r3, [src, #-2] |  | ||||||
|     pld         [src, #23] |  | ||||||
|     ldrneh      r4, [src], pstep |  | ||||||
|  |  | ||||||
|     pkhbt       r8, r5, r6, lsl #16         ; row 1 |  | ||||||
|  |  | ||||||
|     ldrneh      r5, [src, #-2] |  | ||||||
|     pld         [src, #23] |  | ||||||
|     ldrneh      r6, [src], pstep |  | ||||||
|  |  | ||||||
|     bne         simple_vnext8 |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r11, pc} |  | ||||||
|     ENDP        ; |vp8_loop_filter_simple_vertical_edge_armv6| |  | ||||||
|  |  | ||||||
| ; Constant Pool |  | ||||||
| c0x80808080 DCD     0x80808080 |  | ||||||
| c0x03030303 DCD     0x03030303 |  | ||||||
| c0x04040404 DCD     0x04040404 |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,273 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vp8_sixtap_predict8x4_armv6| |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY  ; name this block of code |  | ||||||
| ;------------------------------------- |  | ||||||
| ; r0    unsigned char *src_ptr, |  | ||||||
| ; r1    int  src_pixels_per_line, |  | ||||||
| ; r2    int  xoffset, |  | ||||||
| ; r3    int  yoffset, |  | ||||||
| ; stack unsigned char *dst_ptr, |  | ||||||
| ; stack int  dst_pitch |  | ||||||
| ;------------------------------------- |  | ||||||
| ;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184. |  | ||||||
| ;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack, |  | ||||||
| ;and the result is stored in transpose. |  | ||||||
| |vp8_sixtap_predict8x4_armv6| PROC |  | ||||||
| ; Two-pass six-tap prediction producing a block 8 pixels wide and 4 rows high. |  | ||||||
| ; Pass 1 filters 9 source rows horizontally (starting 2 rows above src_ptr) |  | ||||||
| ; into a transposed 16-bit buffer on the stack; pass 2 filters that buffer |  | ||||||
| ; vertically and writes bytes to dst_ptr. Either pass is replaced by a plain |  | ||||||
| ; copy when its offset (xoffset / yoffset) is 0. |  | ||||||
|     stmdb       sp!, {r4 - r11, lr} |  | ||||||
|     str         r3, [sp, #-184]!            ;reserve space on stack for temporary storage, store yoffset |  | ||||||
|  |  | ||||||
|     cmp         r2, #0                      ;skip first_pass filter if xoffset=0 |  | ||||||
|     add         lr, sp, #4                  ;point to temporary buffer |  | ||||||
|     beq         skip_firstpass_filter |  | ||||||
|  |  | ||||||
| ;first-pass filter |  | ||||||
|     adr         r12, filter8_coeff |  | ||||||
|     sub         r0, r0, r1, lsl #1          ; start 2 rows above src_ptr |  | ||||||
|  |  | ||||||
|     add         r3, r1, #10                 ; preload next row |  | ||||||
|     pld         [r0, r3] |  | ||||||
|  |  | ||||||
|     add         r2, r12, r2, lsl #4         ;calculate filter location |  | ||||||
|     add         r0, r0, #3                  ;adjust src only for loading convenience |  | ||||||
|  |  | ||||||
|     ldr         r3, [r2]                    ; load up packed filter coefficients |  | ||||||
|     ldr         r4, [r2, #4] |  | ||||||
|     ldr         r5, [r2, #8] |  | ||||||
|  |  | ||||||
|     mov         r2, #0x90000                ; height=9 is top part of counter |  | ||||||
|  |  | ||||||
|     sub         r1, r1, #8                  ; r1 = stride - 8 (r0 advances 8 bytes per row in the loop) |  | ||||||
|  |  | ||||||
| |first_pass_hloop_v6| |  | ||||||
|     ldrb        r6, [r0, #-5]               ; load source data |  | ||||||
|     ldrb        r7, [r0, #-4] |  | ||||||
|     ldrb        r8, [r0, #-3] |  | ||||||
|     ldrb        r9, [r0, #-2] |  | ||||||
|     ldrb        r10, [r0, #-1] |  | ||||||
|  |  | ||||||
|     orr         r2, r2, #0x4                ; construct loop counter. width=8=4x2 |  | ||||||
|  |  | ||||||
|     pkhbt       r6, r6, r7, lsl #16         ; r7 | r6 |  | ||||||
|     pkhbt       r7, r7, r8, lsl #16         ; r8 | r7 |  | ||||||
|  |  | ||||||
|     pkhbt       r8, r8, r9, lsl #16         ; r9 | r8 |  | ||||||
|     pkhbt       r9, r9, r10, lsl #16        ; r10 | r9 |  | ||||||
|  |  | ||||||
| |first_pass_wloop_v6| |  | ||||||
|     ; each iteration produces two horizontally adjacent outputs (r11, r12) |  | ||||||
|     smuad       r11, r6, r3                 ; vp8_filter[0], vp8_filter[1] |  | ||||||
|     smuad       r12, r7, r3 |  | ||||||
|  |  | ||||||
|     ldrb        r6, [r0], #1 |  | ||||||
|  |  | ||||||
|     smlad       r11, r8, r4, r11            ; vp8_filter[2], vp8_filter[3] |  | ||||||
|     ldrb        r7, [r0], #1 |  | ||||||
|     smlad       r12, r9, r4, r12 |  | ||||||
|  |  | ||||||
|     pkhbt       r10, r10, r6, lsl #16       ; r10 | r9 |  | ||||||
|     pkhbt       r6, r6, r7, lsl #16         ; r11 | r10 |  | ||||||
|     smlad       r11, r10, r5, r11           ; vp8_filter[4], vp8_filter[5] |  | ||||||
|     smlad       r12, r6, r5, r12 |  | ||||||
|  |  | ||||||
|     sub         r2, r2, #1 |  | ||||||
|  |  | ||||||
|     add         r11, r11, #0x40             ; round_shift_and_clamp |  | ||||||
|     tst         r2, #0xff                   ; test loop counter |  | ||||||
|     usat        r11, #8, r11, asr #7 |  | ||||||
|     add         r12, r12, #0x40 |  | ||||||
|     strh        r11, [lr], #20              ; result is transposed and stored: one 20-byte buffer line per source column |  | ||||||
|     usat        r12, #8, r12, asr #7 |  | ||||||
|  |  | ||||||
|     strh        r12, [lr], #20 |  | ||||||
|  |  | ||||||
|     ; slide the packed sample window along by two pixels for the next iteration |  | ||||||
|     movne       r11, r6 |  | ||||||
|     movne       r12, r7 |  | ||||||
|  |  | ||||||
|     movne       r6, r8 |  | ||||||
|     movne       r7, r9 |  | ||||||
|     movne       r8, r10 |  | ||||||
|     movne       r9, r11 |  | ||||||
|     movne       r10, r12 |  | ||||||
|  |  | ||||||
|     bne         first_pass_wloop_v6 |  | ||||||
|  |  | ||||||
|     ;;add       r9, ppl, #30                ; attempt to load 2 adjacent cache lines |  | ||||||
|     ;;IF ARCHITECTURE=6 |  | ||||||
|     ;pld        [src, ppl] |  | ||||||
|     ;;pld       [src, r9] |  | ||||||
|     ;;ENDIF |  | ||||||
|  |  | ||||||
|     subs        r2, r2, #0x10000            ; one source row done (row count lives in the upper halfword) |  | ||||||
|  |  | ||||||
|     sub         lr, lr, #158                ; back 8*20 bytes, forward 2: next entry of buffer line 0 |  | ||||||
|  |  | ||||||
|     add         r0, r0, r1                  ; move to next input line |  | ||||||
|  |  | ||||||
|     add         r11, r1, #18                ; preload next row. adding back block width(=8), which is subtracted earlier |  | ||||||
|     pld         [r0, r11] |  | ||||||
|  |  | ||||||
|     bne         first_pass_hloop_v6 |  | ||||||
|  |  | ||||||
| ;second pass filter |  | ||||||
| secondpass_filter |  | ||||||
|     ldr         r3, [sp], #4                ; load back yoffset |  | ||||||
|     ldr         r0, [sp, #216]              ; load dst address from stack 180+36 |  | ||||||
|     ldr         r1, [sp, #220]              ; load dst stride from stack 180+40 |  | ||||||
|  |  | ||||||
|     cmp         r3, #0 |  | ||||||
|     beq         skip_secondpass_filter |  | ||||||
|  |  | ||||||
|     adr         r12, filter8_coeff |  | ||||||
|     add         lr, r12, r3, lsl #4         ;calculate filter location |  | ||||||
|  |  | ||||||
|     mov         r2, #0x00080000             ; 8 output columns; count kept in the upper halfword |  | ||||||
|  |  | ||||||
|     ldr         r3, [lr]                    ; load up packed filter coefficients |  | ||||||
|     ldr         r4, [lr, #4] |  | ||||||
|     ldr         r5, [lr, #8] |  | ||||||
|  |  | ||||||
|     pkhbt       r12, r4, r3                 ; pack the filter differently |  | ||||||
|     pkhbt       r11, r5, r4 |  | ||||||
|  |  | ||||||
| second_pass_hloop_v6 |  | ||||||
|     ldr         r6, [sp]                    ; load the data |  | ||||||
|     ldr         r7, [sp, #4] |  | ||||||
|  |  | ||||||
|     orr         r2, r2, #2                  ; loop counter |  | ||||||
|  |  | ||||||
| second_pass_wloop_v6 |  | ||||||
|     ; lr accumulates one output sample, r10 the sample one row below it |  | ||||||
|     smuad       lr, r3, r6                  ; apply filter |  | ||||||
|     smulbt      r10, r3, r6 |  | ||||||
|  |  | ||||||
|     ldr         r8, [sp, #8] |  | ||||||
|  |  | ||||||
|     smlad       lr, r4, r7, lr |  | ||||||
|     smladx      r10, r12, r7, r10 |  | ||||||
|  |  | ||||||
|     ldrh        r9, [sp, #12] |  | ||||||
|  |  | ||||||
|     smlad       lr, r5, r8, lr |  | ||||||
|     smladx      r10, r11, r8, r10 |  | ||||||
|  |  | ||||||
|     add         sp, sp, #4                  ; sp doubles as the read pointer into the temp buffer |  | ||||||
|     smlatb      r10, r5, r9, r10 |  | ||||||
|  |  | ||||||
|     sub         r2, r2, #1 |  | ||||||
|  |  | ||||||
|     add         lr, lr, #0x40               ; round_shift_and_clamp |  | ||||||
|     tst         r2, #0xff |  | ||||||
|     usat        lr, #8, lr, asr #7 |  | ||||||
|     add         r10, r10, #0x40 |  | ||||||
|     strb        lr, [r0], r1                ; the result is transposed back and stored |  | ||||||
|     usat        r10, #8, r10, asr #7 |  | ||||||
|  |  | ||||||
|     strb        r10, [r0],r1 |  | ||||||
|  |  | ||||||
|     movne       r6, r7 |  | ||||||
|     movne       r7, r8 |  | ||||||
|  |  | ||||||
|     bne         second_pass_wloop_v6 |  | ||||||
|  |  | ||||||
|     subs        r2, r2, #0x10000 |  | ||||||
|     add         sp, sp, #12                 ; update src for next loop (20-8) |  | ||||||
|     sub         r0, r0, r1, lsl #2          ; back up the 4 rows just written ... |  | ||||||
|     add         r0, r0, #1                  ; ... and step to the next output column |  | ||||||
|  |  | ||||||
|     bne         second_pass_hloop_v6 |  | ||||||
|  |  | ||||||
|     add         sp, sp, #20                 ; release the remainder of the 184-byte area (184 - 4 - 8*20) |  | ||||||
|     ldmia       sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
| ;-------------------- |  | ||||||
| ; xoffset == 0: copy 9 rows of 8 source bytes, unfiltered, into the transposed |  | ||||||
| ; 16-bit temp buffer |  | ||||||
| skip_firstpass_filter |  | ||||||
|     sub         r0, r0, r1, lsl #1          ; start 2 rows above src_ptr |  | ||||||
|     sub         r1, r1, #8 |  | ||||||
|     mov         r2, #9                      ; row counter |  | ||||||
|  |  | ||||||
| skip_firstpass_hloop |  | ||||||
|     ldrb        r4, [r0], #1                ; load data |  | ||||||
|     subs        r2, r2, #1 |  | ||||||
|     ldrb        r5, [r0], #1 |  | ||||||
|     strh        r4, [lr], #20               ; store it to intermediate buffer |  | ||||||
|     ldrb        r6, [r0], #1                ; load data |  | ||||||
|     strh        r5, [lr], #20 |  | ||||||
|     ldrb        r7, [r0], #1 |  | ||||||
|     strh        r6, [lr], #20 |  | ||||||
|     ldrb        r8, [r0], #1 |  | ||||||
|     strh        r7, [lr], #20 |  | ||||||
|     ldrb        r9, [r0], #1 |  | ||||||
|     strh        r8, [lr], #20 |  | ||||||
|     ldrb        r10, [r0], #1 |  | ||||||
|     strh        r9, [lr], #20 |  | ||||||
|     ldrb        r11, [r0], #1 |  | ||||||
|     strh        r10, [lr], #20 |  | ||||||
|     add         r0, r0, r1                  ; move to next input line |  | ||||||
|     strh        r11, [lr], #20 |  | ||||||
|  |  | ||||||
|     sub         lr, lr, #158                ; move over to next column |  | ||||||
|     bne         skip_firstpass_hloop |  | ||||||
|  |  | ||||||
|     b           secondpass_filter |  | ||||||
|  |  | ||||||
| ;-------------------- |  | ||||||
| ; yoffset == 0: copy the middle 4 samples of each buffer line to dst, unfiltered |  | ||||||
| skip_secondpass_filter |  | ||||||
|     mov         r2, #8                      ; 8 output columns |  | ||||||
|     add         sp, sp, #4                  ;start from src[0] instead of src[-2] |  | ||||||
|  |  | ||||||
| skip_secondpass_hloop |  | ||||||
|     ldr         r6, [sp], #4 |  | ||||||
|     subs        r2, r2, #1 |  | ||||||
|     ldr         r8, [sp], #4 |  | ||||||
|  |  | ||||||
|     mov         r7, r6, lsr #16             ; unpack |  | ||||||
|     strb        r6, [r0], r1 |  | ||||||
|     mov         r9, r8, lsr #16 |  | ||||||
|     strb        r7, [r0], r1 |  | ||||||
|     add         sp, sp, #12                 ; 20-8 |  | ||||||
|     strb        r8, [r0], r1 |  | ||||||
|     strb        r9, [r0], r1 |  | ||||||
|  |  | ||||||
|     sub         r0, r0, r1, lsl #2 |  | ||||||
|     add         r0, r0, #1 |  | ||||||
|  |  | ||||||
|     bne         skip_secondpass_hloop |  | ||||||
|  |  | ||||||
|     add         sp, sp, #16                 ; 180 - (160 +4) |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ;----------------- |  | ||||||
| ;Packed filter table, one row of four words (16 bytes) per filter. Label filter8_coeff is used to access the data. |  | ||||||
| ;Each of the first three words packs two 16-bit taps (even tap in the low halfword); the fourth word is zero padding. |  | ||||||
| filter8_coeff |  | ||||||
|     DCD     0x00000000,     0x00000080,     0x00000000,     0x00000000 |  | ||||||
|     DCD     0xfffa0000,     0x000c007b,     0x0000ffff,     0x00000000 |  | ||||||
|     DCD     0xfff50002,     0x0024006c,     0x0001fff8,     0x00000000 |  | ||||||
|     DCD     0xfff70000,     0x0032005d,     0x0000fffa,     0x00000000 |  | ||||||
|     DCD     0xfff00003,     0x004d004d,     0x0003fff0,     0x00000000 |  | ||||||
|     DCD     0xfffa0000,     0x005d0032,     0x0000fff7,     0x00000000 |  | ||||||
|     DCD     0xfff80001,     0x006c0024,     0x0002fff5,     0x00000000 |  | ||||||
|     DCD     0xffff0000,     0x007b000c,     0x0000fffa,     0x00000000 |  | ||||||
|  |  | ||||||
|     ;DCD        0,  0,  128,    0,   0,  0 |  | ||||||
|     ;DCD        0, -6,  123,   12,  -1,  0 |  | ||||||
|     ;DCD        2, -11, 108,   36,  -8,  1 |  | ||||||
|     ;DCD        0, -9,   93,   50,  -6,  0 |  | ||||||
|     ;DCD        3, -16,  77,   77, -16,  3 |  | ||||||
|     ;DCD        0, -6,   50,   93,  -9,  0 |  | ||||||
|     ;DCD        1, -8,   36,  108, -11,  2 |  | ||||||
|     ;DCD        0, -1,   12,  123,  -6,  0 |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,87 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include "vpx_config.h" |  | ||||||
| #include "vp8_rtcd.h" |  | ||||||
| #include <math.h> |  | ||||||
| #include "vp8/common/filter.h" |  | ||||||
| #include "bilinearfilter_arm.h" |  | ||||||
|  |  | ||||||
| /* Two-pass bilinear filter: a horizontal pass into a scratch buffer, then a |  | ||||||
|  * vertical pass from the scratch buffer into dst_ptr. */ |  | ||||||
| void vp8_filter_block2d_bil_armv6(unsigned char *src_ptr, |  | ||||||
|                                   unsigned char *dst_ptr, |  | ||||||
|                                   unsigned int src_pitch, |  | ||||||
|                                   unsigned int dst_pitch, const short *HFilter, |  | ||||||
|                                   const short *VFilter, int Width, int Height) { |  | ||||||
|   /* Scratch storage shared by the two passes. */ |  | ||||||
|   unsigned short intermediate[36 * 16]; |  | ||||||
|  |  | ||||||
|   /* Horizontal pass: one row more than the output height is requested. */ |  | ||||||
|   vp8_filter_block2d_bil_first_pass_armv6(src_ptr, intermediate, src_pitch, |  | ||||||
|                                           Height + 1, Width, HFilter); |  | ||||||
|  |  | ||||||
|   /* Vertical pass: consume the scratch rows and write the final pixels. */ |  | ||||||
|   vp8_filter_block2d_bil_second_pass_armv6(intermediate, dst_ptr, dst_pitch, |  | ||||||
|                                            Height, Width, VFilter); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void vp8_bilinear_predict4x4_armv6(unsigned char *src_ptr, |  | ||||||
|                                    int src_pixels_per_line, int xoffset, |  | ||||||
|                                    int yoffset, unsigned char *dst_ptr, |  | ||||||
|                                    int dst_pitch) { |  | ||||||
|   const short *HFilter; |  | ||||||
|   const short *VFilter; |  | ||||||
|  |  | ||||||
|   HFilter = vp8_bilinear_filters[xoffset]; |  | ||||||
|   VFilter = vp8_bilinear_filters[yoffset]; |  | ||||||
|  |  | ||||||
|   vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, |  | ||||||
|                                HFilter, VFilter, 4, 4); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void vp8_bilinear_predict8x8_armv6(unsigned char *src_ptr, |  | ||||||
|                                    int src_pixels_per_line, int xoffset, |  | ||||||
|                                    int yoffset, unsigned char *dst_ptr, |  | ||||||
|                                    int dst_pitch) { |  | ||||||
|   const short *HFilter; |  | ||||||
|   const short *VFilter; |  | ||||||
|  |  | ||||||
|   HFilter = vp8_bilinear_filters[xoffset]; |  | ||||||
|   VFilter = vp8_bilinear_filters[yoffset]; |  | ||||||
|  |  | ||||||
|   vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, |  | ||||||
|                                HFilter, VFilter, 8, 8); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void vp8_bilinear_predict8x4_armv6(unsigned char *src_ptr, |  | ||||||
|                                    int src_pixels_per_line, int xoffset, |  | ||||||
|                                    int yoffset, unsigned char *dst_ptr, |  | ||||||
|                                    int dst_pitch) { |  | ||||||
|   const short *HFilter; |  | ||||||
|   const short *VFilter; |  | ||||||
|  |  | ||||||
|   HFilter = vp8_bilinear_filters[xoffset]; |  | ||||||
|   VFilter = vp8_bilinear_filters[yoffset]; |  | ||||||
|  |  | ||||||
|   vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, |  | ||||||
|                                HFilter, VFilter, 8, 4); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void vp8_bilinear_predict16x16_armv6(unsigned char *src_ptr, |  | ||||||
|                                      int src_pixels_per_line, int xoffset, |  | ||||||
|                                      int yoffset, unsigned char *dst_ptr, |  | ||||||
|                                      int dst_pitch) { |  | ||||||
|   const short *HFilter; |  | ||||||
|   const short *VFilter; |  | ||||||
|  |  | ||||||
|   HFilter = vp8_bilinear_filters[xoffset]; |  | ||||||
|   VFilter = vp8_bilinear_filters[yoffset]; |  | ||||||
|  |  | ||||||
|   vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, |  | ||||||
|                                HFilter, VFilter, 16, 16); |  | ||||||
| } |  | ||||||
| @@ -1,31 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2011 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_

#ifdef __cplusplus
extern "C" {
#endif

/* First bilinear pass: reads 8-bit samples from src_ptr (row pitch
 * src_pitch) and writes unsigned short intermediates to dst_ptr for a
 * region of the given height and width, using the taps in vp8_filter. */
void vp8_filter_block2d_bil_first_pass_armv6(
    const unsigned char *src_ptr, unsigned short *dst_ptr,
    unsigned int src_pitch, unsigned int height, unsigned int width,
    const short *vp8_filter);

/* Second bilinear pass: reads the unsigned short intermediates from src_ptr
 * and writes 8-bit samples to dst_ptr (row pitch dst_pitch) for a region of
 * the given height and width, using the taps in vp8_filter. */
void vp8_filter_block2d_bil_second_pass_armv6(
    const unsigned short *src_ptr, unsigned char *dst_ptr, int dst_pitch,
    unsigned int height, unsigned int width, const short *vp8_filter);

#ifdef __cplusplus
}  // extern "C"
#endif

#endif  // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
| @@ -1,23 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include "vpx_config.h" |  | ||||||
| #include "vp8/common/blockd.h" |  | ||||||
|  |  | ||||||
#if HAVE_MEDIA
/* Loop routine defined outside this file; it receives the quantized
 * coefficients (Q), the dequantization factors (DQC) and the output buffer
 * (DQ). */
extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);

/* Thin wrapper that unpacks the block descriptor: d->qcoeff is passed as the
 * input, d->dqcoeff as the output, and DQC is forwarded unchanged. */
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC) {
  vp8_dequantize_b_loop_v6(d->qcoeff, DQC, d->dqcoeff);
}
#endif
| @@ -1,176 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include "vpx_config.h" |  | ||||||
| #include "vp8_rtcd.h" |  | ||||||
| #include <math.h> |  | ||||||
| #include "vp8/common/filter.h" |  | ||||||
| #include "vpx_ports/mem.h" |  | ||||||
|  |  | ||||||
/* Prototypes for the externally defined routines that the six-tap predictors
 * in this file call. */

/* First pass into a short intermediate buffer; used by the 4x4 predictor. */
void vp8_filter_block2d_first_pass_armv6(
    unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
    unsigned int output_width, unsigned int output_height,
    const short *vp8_filter);

/* First pass into a short intermediate buffer; used by the 8x8 predictor. */
void vp8_filter_block2d_first_pass_8x8_armv6(
    unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
    unsigned int output_width, unsigned int output_height,
    const short *vp8_filter);

/* First pass into a short intermediate buffer; used by the 16x16 predictor. */
void vp8_filter_block2d_first_pass_16x16_armv6(
    unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
    unsigned int output_width, unsigned int output_height,
    const short *vp8_filter);

/* Second pass from the intermediate buffer to 8-bit output; the callers use
 * it when the vertical filter is the 6-tap one. */
void vp8_filter_block2d_second_pass_armv6(short *src_ptr,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int cnt,
                                          const short *vp8_filter);

/* Second pass from the intermediate buffer to 8-bit output; the callers use
 * it when the vertical filter is the 4-tap one. */
void vp8_filter4_block2d_second_pass_armv6(short *src_ptr,
                                           unsigned char *output_ptr,
                                           unsigned int output_pitch,
                                           unsigned int cnt,
                                           const short *vp8_filter);

/* Single-pass variant the callers use when only xoffset is non-zero; it
 * writes 8-bit output directly. */
void vp8_filter_block2d_first_pass_only_armv6(
    unsigned char *src_ptr, unsigned char *output_ptr,
    unsigned int src_pixels_per_line, unsigned int cnt,
    unsigned int output_pitch, const short *vp8_filter);

/* Single-pass variant the callers use when only yoffset is non-zero; it
 * writes 8-bit output directly. */
void vp8_filter_block2d_second_pass_only_armv6(
    unsigned char *src_ptr, unsigned char *output_ptr,
    unsigned int src_pixels_per_line, unsigned int cnt,
    unsigned int output_pitch, const short *vp8_filter);
|  |  | ||||||
| #if HAVE_MEDIA |  | ||||||
| void vp8_sixtap_predict4x4_armv6(unsigned char *src_ptr, |  | ||||||
|                                  int src_pixels_per_line, int xoffset, |  | ||||||
|                                  int yoffset, unsigned char *dst_ptr, |  | ||||||
|                                  int dst_pitch) { |  | ||||||
|   const short *HFilter; |  | ||||||
|   const short *VFilter; |  | ||||||
|   DECLARE_ALIGNED(4, short, |  | ||||||
|                   FData[12 * 4]); /* Temp data buffer used in filtering */ |  | ||||||
|  |  | ||||||
|   HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ |  | ||||||
|   VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ |  | ||||||
|  |  | ||||||
|   /* Vfilter is null. First pass only */ |  | ||||||
|   if (xoffset && !yoffset) { |  | ||||||
|     /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, |  | ||||||
|     src_pixels_per_line, 4, 4, HFilter ); |  | ||||||
|     vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, |  | ||||||
|     VFilter );*/ |  | ||||||
|  |  | ||||||
|     vp8_filter_block2d_first_pass_only_armv6( |  | ||||||
|         src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter); |  | ||||||
|   } |  | ||||||
|   /* Hfilter is null. Second pass only */ |  | ||||||
|   else if (!xoffset && yoffset) { |  | ||||||
|     vp8_filter_block2d_second_pass_only_armv6( |  | ||||||
|         src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter); |  | ||||||
|   } else { |  | ||||||
|     /* Vfilter is a 4 tap filter */ |  | ||||||
|     if (yoffset & 0x1) { |  | ||||||
|       vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, |  | ||||||
|                                           FData + 1, src_pixels_per_line, 4, 7, |  | ||||||
|                                           HFilter); |  | ||||||
|       vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, |  | ||||||
|                                             VFilter); |  | ||||||
|     } |  | ||||||
|     /* Vfilter is 6 tap filter */ |  | ||||||
|     else { |  | ||||||
|       vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), |  | ||||||
|                                           FData, src_pixels_per_line, 4, 9, |  | ||||||
|                                           HFilter); |  | ||||||
|       vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, |  | ||||||
|                                            VFilter); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void vp8_sixtap_predict8x8_armv6(unsigned char *src_ptr, |  | ||||||
|                                  int src_pixels_per_line, int xoffset, |  | ||||||
|                                  int yoffset, unsigned char *dst_ptr, |  | ||||||
|                                  int dst_pitch) { |  | ||||||
|   const short *HFilter; |  | ||||||
|   const short *VFilter; |  | ||||||
|   DECLARE_ALIGNED(4, short, |  | ||||||
|                   FData[16 * 8]); /* Temp data buffer used in filtering */ |  | ||||||
|  |  | ||||||
|   HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ |  | ||||||
|   VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ |  | ||||||
|  |  | ||||||
|   if (xoffset && !yoffset) { |  | ||||||
|     vp8_filter_block2d_first_pass_only_armv6( |  | ||||||
|         src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter); |  | ||||||
|   } |  | ||||||
|   /* Hfilter is null. Second pass only */ |  | ||||||
|   else if (!xoffset && yoffset) { |  | ||||||
|     vp8_filter_block2d_second_pass_only_armv6( |  | ||||||
|         src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter); |  | ||||||
|   } else { |  | ||||||
|     if (yoffset & 0x1) { |  | ||||||
|       vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, |  | ||||||
|                                               FData + 1, src_pixels_per_line, 8, |  | ||||||
|                                               11, HFilter); |  | ||||||
|       vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, |  | ||||||
|                                             VFilter); |  | ||||||
|     } else { |  | ||||||
|       vp8_filter_block2d_first_pass_8x8_armv6( |  | ||||||
|           src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, |  | ||||||
|           13, HFilter); |  | ||||||
|       vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, |  | ||||||
|                                            VFilter); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| void vp8_sixtap_predict16x16_armv6(unsigned char *src_ptr, |  | ||||||
|                                    int src_pixels_per_line, int xoffset, |  | ||||||
|                                    int yoffset, unsigned char *dst_ptr, |  | ||||||
|                                    int dst_pitch) { |  | ||||||
|   const short *HFilter; |  | ||||||
|   const short *VFilter; |  | ||||||
|   DECLARE_ALIGNED(4, short, |  | ||||||
|                   FData[24 * 16]); /* Temp data buffer used in filtering */ |  | ||||||
|  |  | ||||||
|   HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ |  | ||||||
|   VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ |  | ||||||
|  |  | ||||||
|   if (xoffset && !yoffset) { |  | ||||||
|     vp8_filter_block2d_first_pass_only_armv6( |  | ||||||
|         src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter); |  | ||||||
|   } |  | ||||||
|   /* Hfilter is null. Second pass only */ |  | ||||||
|   else if (!xoffset && yoffset) { |  | ||||||
|     vp8_filter_block2d_second_pass_only_armv6( |  | ||||||
|         src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter); |  | ||||||
|   } else { |  | ||||||
|     if (yoffset & 0x1) { |  | ||||||
|       vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, |  | ||||||
|                                                 FData + 1, src_pixels_per_line, |  | ||||||
|                                                 16, 19, HFilter); |  | ||||||
|       vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, |  | ||||||
|                                             VFilter); |  | ||||||
|     } else { |  | ||||||
|       vp8_filter_block2d_first_pass_16x16_armv6( |  | ||||||
|           src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, |  | ||||||
|           21, HFilter); |  | ||||||
|       vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, |  | ||||||
|                                            VFilter); |  | ||||||
|     } |  | ||||||
|   } |  | ||||||
| } |  | ||||||
| #endif |  | ||||||
| @@ -13,18 +13,6 @@ | |||||||
| #include "vp8/common/loopfilter.h" | #include "vp8/common/loopfilter.h" | ||||||
| #include "vp8/common/onyxc_int.h" | #include "vp8/common/onyxc_int.h" | ||||||
|  |  | ||||||
/* Expands to the prototype shared by the edge loop-filter routines declared
 * below: source pointer, pitch, three threshold pointers and a count. */
#define prototype_loopfilter(sym)                                      \
  void sym(unsigned char *src, int pitch, const unsigned char *blimit, \
           const unsigned char *limit, const unsigned char *thresh, int count)

#if HAVE_MEDIA
/* Edge filters used by the ARMv6 wrappers in this file. */
prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
#endif
|  |  | ||||||
| #if HAVE_NEON |  | ||||||
| typedef void loopfilter_y_neon(unsigned char *src, int pitch, | typedef void loopfilter_y_neon(unsigned char *src, int pitch, | ||||||
|                                unsigned char blimit, unsigned char limit, |                                unsigned char blimit, unsigned char limit, | ||||||
|                                unsigned char thresh); |                                unsigned char thresh); | ||||||
| @@ -41,101 +29,7 @@ extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; | |||||||
| extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; | extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; | ||||||
| extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; | extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; | ||||||
| extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; | extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; | ||||||
| #endif |  | ||||||
|  |  | ||||||
| #if HAVE_MEDIA |  | ||||||
| /* ARMV6/MEDIA loopfilter functions*/ |  | ||||||
| /* Horizontal MB filtering */ |  | ||||||
| void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, |  | ||||||
|                                unsigned char *v_ptr, int y_stride, |  | ||||||
|                                int uv_stride, loop_filter_info *lfi) { |  | ||||||
|   vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, |  | ||||||
|                                           lfi->hev_thr, 2); |  | ||||||
|  |  | ||||||
|   if (u_ptr) |  | ||||||
|     vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, |  | ||||||
|                                             lfi->lim, lfi->hev_thr, 1); |  | ||||||
|  |  | ||||||
|   if (v_ptr) |  | ||||||
|     vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, |  | ||||||
|                                             lfi->lim, lfi->hev_thr, 1); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /* Vertical MB Filtering */ |  | ||||||
| void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, |  | ||||||
|                                unsigned char *v_ptr, int y_stride, |  | ||||||
|                                int uv_stride, loop_filter_info *lfi) { |  | ||||||
|   vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, |  | ||||||
|                                         lfi->hev_thr, 2); |  | ||||||
|  |  | ||||||
|   if (u_ptr) |  | ||||||
|     vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, |  | ||||||
|                                           lfi->lim, lfi->hev_thr, 1); |  | ||||||
|  |  | ||||||
|   if (v_ptr) |  | ||||||
|     vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, |  | ||||||
|                                           lfi->lim, lfi->hev_thr, 1); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /* Horizontal B Filtering */ |  | ||||||
| void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, |  | ||||||
|                               unsigned char *v_ptr, int y_stride, int uv_stride, |  | ||||||
|                               loop_filter_info *lfi) { |  | ||||||
|   vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, |  | ||||||
|                                         lfi->blim, lfi->lim, lfi->hev_thr, 2); |  | ||||||
|   vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, |  | ||||||
|                                         lfi->blim, lfi->lim, lfi->hev_thr, 2); |  | ||||||
|   vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, |  | ||||||
|                                         lfi->blim, lfi->lim, lfi->hev_thr, 2); |  | ||||||
|  |  | ||||||
|   if (u_ptr) |  | ||||||
|     vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, |  | ||||||
|                                           lfi->blim, lfi->lim, lfi->hev_thr, 1); |  | ||||||
|  |  | ||||||
|   if (v_ptr) |  | ||||||
|     vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, |  | ||||||
|                                           lfi->blim, lfi->lim, lfi->hev_thr, 1); |  | ||||||
| } |  | ||||||
|  |  | ||||||
/* Simple horizontal inner-edge filtering: applies the simple horizontal edge
 * filter to the luma edges 4, 8 and 12 rows below y_ptr. */
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
                               const unsigned char *blimit) {
  int row;

  for (row = 4; row <= 12; row += 4) {
    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + row * y_stride,
                                                 y_stride, blimit);
  }
}
|  |  | ||||||
| /* Vertical B Filtering */ |  | ||||||
| void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, |  | ||||||
|                               unsigned char *v_ptr, int y_stride, int uv_stride, |  | ||||||
|                               loop_filter_info *lfi) { |  | ||||||
|   vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, |  | ||||||
|                                       lfi->hev_thr, 2); |  | ||||||
|   vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, |  | ||||||
|                                       lfi->hev_thr, 2); |  | ||||||
|   vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, |  | ||||||
|                                       lfi->hev_thr, 2); |  | ||||||
|  |  | ||||||
|   if (u_ptr) |  | ||||||
|     vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, |  | ||||||
|                                         lfi->lim, lfi->hev_thr, 1); |  | ||||||
|  |  | ||||||
|   if (v_ptr) |  | ||||||
|     vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, |  | ||||||
|                                         lfi->lim, lfi->hev_thr, 1); |  | ||||||
| } |  | ||||||
|  |  | ||||||
/* Simple vertical inner-edge filtering: applies the simple vertical edge
 * filter to the luma edges at column offsets 4, 8 and 12 from y_ptr. */
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
                               const unsigned char *blimit) {
  int col;

  for (col = 4; col <= 12; col += 4) {
    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + col, y_stride, blimit);
  }
}
| #endif |  | ||||||
|  |  | ||||||
| #if HAVE_NEON |  | ||||||
| /* NEON loopfilter functions */ | /* NEON loopfilter functions */ | ||||||
| /* Horizontal MB filtering */ | /* Horizontal MB filtering */ | ||||||
| void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, | void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, | ||||||
| @@ -205,4 +99,3 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, | |||||||
|     vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, |     vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, | ||||||
|                                           hev_thr, v_ptr + 4); |                                           hev_thr, v_ptr + 4); | ||||||
| } | } | ||||||
| #endif |  | ||||||
|   | |||||||
| @@ -29,81 +29,69 @@ $vp8_clear_system_state_mmx=vpx_reset_mmx_state; | |||||||
| # Dequant | # Dequant | ||||||
| # | # | ||||||
| add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc"; | add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc"; | ||||||
| specialize qw/vp8_dequantize_b mmx media neon msa/; | specialize qw/vp8_dequantize_b mmx neon msa/; | ||||||
| $vp8_dequantize_b_media=vp8_dequantize_b_v6; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride"; | add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride"; | ||||||
| specialize qw/vp8_dequant_idct_add mmx media neon dspr2 msa/; | specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/; | ||||||
| $vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6; |  | ||||||
| $vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2; | $vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; | add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; | ||||||
| specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2 msa/; | specialize qw/vp8_dequant_idct_add_y_block mmx sse2 neon dspr2 msa/; | ||||||
| $vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6; |  | ||||||
| $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; | $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; | add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; | ||||||
| specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2 msa/; | specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 neon dspr2 msa/; | ||||||
| $vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6; |  | ||||||
| $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; | $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; | ||||||
|  |  | ||||||
| # | # | ||||||
| # Loopfilter | # Loopfilter | ||||||
| # | # | ||||||
| add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; | add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; | ||||||
| specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2 msa/; | specialize qw/vp8_loop_filter_mbv mmx sse2 neon dspr2 msa/; | ||||||
| $vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6; |  | ||||||
| $vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2; | $vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; | add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; | ||||||
| specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2 msa/; | specialize qw/vp8_loop_filter_bv mmx sse2 neon dspr2 msa/; | ||||||
| $vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6; |  | ||||||
| $vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2; | $vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; | add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; | ||||||
| specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2 msa/; | specialize qw/vp8_loop_filter_mbh mmx sse2 neon dspr2 msa/; | ||||||
| $vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6; |  | ||||||
| $vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2; | $vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; | add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; | ||||||
| specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2 msa/; | specialize qw/vp8_loop_filter_bh mmx sse2 neon dspr2 msa/; | ||||||
| $vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6; |  | ||||||
| $vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2; | $vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2; | ||||||
|  |  | ||||||
|  |  | ||||||
| add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; | add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; | ||||||
| specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon msa/; | specialize qw/vp8_loop_filter_simple_mbv mmx sse2 neon msa/; | ||||||
| $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; | $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; | ||||||
| $vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx; | $vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx; | ||||||
| $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; | $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; | ||||||
| $vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6; |  | ||||||
| $vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon; | $vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon; | ||||||
| $vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa; | $vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit"; | add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit"; | ||||||
| specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon msa/; | specialize qw/vp8_loop_filter_simple_mbh mmx sse2 neon msa/; | ||||||
| $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c; | $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c; | ||||||
| $vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx; | $vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx; | ||||||
| $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2; | $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2; | ||||||
| $vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6; |  | ||||||
| $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon; | $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon; | ||||||
| $vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa; | $vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit"; | add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit"; | ||||||
| specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon msa/; | specialize qw/vp8_loop_filter_simple_bv mmx sse2 neon msa/; | ||||||
| $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; | $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; | ||||||
| $vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx; | $vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx; | ||||||
| $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; | $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; | ||||||
| $vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6; |  | ||||||
| $vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon; | $vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon; | ||||||
| $vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa; | $vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; | add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; | ||||||
| specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon msa/; | specialize qw/vp8_loop_filter_simple_bh mmx sse2 neon msa/; | ||||||
| $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c; | $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c; | ||||||
| $vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx; | $vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx; | ||||||
| $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2; | $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2; | ||||||
| $vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6; |  | ||||||
| $vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon; | $vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon; | ||||||
| $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa; | $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa; | ||||||
|  |  | ||||||
| @@ -112,8 +100,7 @@ $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa; | |||||||
| # | # | ||||||
| #idct16 | #idct16 | ||||||
| add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"; | add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"; | ||||||
| specialize qw/vp8_short_idct4x4llm mmx media neon dspr2 msa/; | specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/; | ||||||
| $vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual; |  | ||||||
| $vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2; | $vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2; | ||||||
|  |  | ||||||
| #iwalsh1 | #iwalsh1 | ||||||
| @@ -124,32 +111,27 @@ $vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2; | |||||||
|  |  | ||||||
| #iwalsh16 | #iwalsh16 | ||||||
| add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output"; | add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output"; | ||||||
| specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2 msa/; | specialize qw/vp8_short_inv_walsh4x4 mmx sse2 neon dspr2 msa/; | ||||||
| $vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6; |  | ||||||
| $vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2; | $vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2; | ||||||
|  |  | ||||||
| #idct1_scalar_add | #idct1_scalar_add | ||||||
| add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"; | add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"; | ||||||
| specialize qw/vp8_dc_only_idct_add	mmx media neon dspr2 msa/; | specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/; | ||||||
| $vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6; |  | ||||||
| $vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2; | $vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2; | ||||||
|  |  | ||||||
| # | # | ||||||
| # RECON | # RECON | ||||||
| # | # | ||||||
| add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2 msa/; | specialize qw/vp8_copy_mem16x16 mmx sse2 neon dspr2 msa/; | ||||||
| $vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6; |  | ||||||
| $vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2; | $vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_copy_mem8x8 mmx media neon dspr2 msa/; | specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/; | ||||||
| $vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6; |  | ||||||
| $vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2; | $vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/; | specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/; | ||||||
| $vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6; |  | ||||||
| $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2; | $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2; | ||||||
|  |  | ||||||
| # | # | ||||||
| @@ -180,40 +162,36 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { | |||||||
| # Subpixel | # Subpixel | ||||||
| # | # | ||||||
| add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2 msa/; | specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 neon dspr2 msa/; | ||||||
| $vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6; |  | ||||||
| $vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2; | $vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2 msa/; | specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 neon dspr2 msa/; | ||||||
| $vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6; |  | ||||||
| $vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2; | $vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2 msa/; | specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/; | ||||||
| $vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6; |  | ||||||
| $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2; | $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2; | ||||||
|  |  | ||||||
|  | # TODO(johannkoenig): Add neon implementation | ||||||
|  | # https://bugs.chromium.org/p/webm/issues/detail?id=1273 | ||||||
| add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/; | specialize qw/vp8_sixtap_predict4x4 mmx ssse3 dspr2 msa/; | ||||||
| $vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6; |  | ||||||
| $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2; | $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2; | ||||||
|  |  | ||||||
| add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon msa/; | specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 neon msa/; | ||||||
| $vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon msa/; | specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 neon msa/; | ||||||
| $vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/; | specialize qw/vp8_bilinear_predict8x4 mmx neon msa/; | ||||||
| $vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6; |  | ||||||
|  |  | ||||||
|  | # TODO(johannkoenig): Add neon implementation | ||||||
|  | # https://bugs.chromium.org/p/webm/issues/detail?id=1273 | ||||||
| add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; | ||||||
| specialize qw/vp8_bilinear_predict4x4 mmx media msa/; | specialize qw/vp8_bilinear_predict4x4 mmx msa/; | ||||||
| $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6; |  | ||||||
|  |  | ||||||
| # | # | ||||||
| # Encoder functions below this point. | # Encoder functions below this point. | ||||||
| @@ -232,16 +210,13 @@ if ($opts{arch} =~ /x86/) { | |||||||
| # Forward DCT | # Forward DCT | ||||||
| # | # | ||||||
| add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; | add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; | ||||||
| specialize qw/vp8_short_fdct4x4 mmx sse2 media neon msa/; | specialize qw/vp8_short_fdct4x4 mmx sse2 neon msa/; | ||||||
| $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; | add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; | ||||||
| specialize qw/vp8_short_fdct8x4 mmx sse2 media neon msa/; | specialize qw/vp8_short_fdct8x4 mmx sse2 neon msa/; | ||||||
| $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6; |  | ||||||
|  |  | ||||||
| add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; | add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; | ||||||
| specialize qw/vp8_short_walsh4x4 sse2 media neon msa/; | specialize qw/vp8_short_walsh4x4 sse2 neon msa/; | ||||||
| $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6; |  | ||||||
|  |  | ||||||
| # | # | ||||||
| # Quantizer | # Quantizer | ||||||
|   | |||||||
| @@ -1,262 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|     EXPORT |vp8_short_fdct4x4_armv6| |  | ||||||
|  |  | ||||||
|     ARM |  | ||||||
|     REQUIRE8 |  | ||||||
|     PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY |  | ||||||
| ; void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch) |  | ||||||
| |vp8_short_fdct4x4_armv6| PROC |  | ||||||
|  |  | ||||||
|     stmfd       sp!, {r4 - r12, lr} |  | ||||||
|  |  | ||||||
|     ; PART 1 |  | ||||||
|  |  | ||||||
|     ; coeffs 0-3 |  | ||||||
|     ldrd        r4, r5, [r0]        ; [i1 | i0] [i3 | i2] |  | ||||||
|  |  | ||||||
|     ldr         r10, c7500 |  | ||||||
|     ldr         r11, c14500 |  | ||||||
|     ldr         r12, c0x22a453a0    ; [2217*4 | 5352*4] |  | ||||||
|     ldr         lr, c0x00080008 |  | ||||||
|     ror         r5, r5, #16         ; [i2 | i3] |  | ||||||
|  |  | ||||||
|     qadd16      r6, r4, r5          ; [i1+i2 | i0+i3] = [b1 | a1] without shift |  | ||||||
|     qsub16      r7, r4, r5          ; [i1-i2 | i0-i3] = [c1 | d1] without shift |  | ||||||
|  |  | ||||||
|     add         r0, r0, r2          ; update input pointer |  | ||||||
|  |  | ||||||
|     qadd16      r7, r7, r7          ; 2*[c1|d1] --> we can use smlad and smlsd |  | ||||||
|                                     ; with 2217*4 and 5352*4 without losing the |  | ||||||
|                                     ; sign bit (overflow) |  | ||||||
|  |  | ||||||
|     smuad       r4, r6, lr          ; o0 = (i1+i2)*8 + (i0+i3)*8 |  | ||||||
|     smusd       r5, r6, lr          ; o2 = (i0+i3)*8 - (i1+i2)*8 |  | ||||||
|  |  | ||||||
|     smlad       r6, r7, r12, r11    ; o1 = (c1 * 2217 + d1 * 5352 +  14500) |  | ||||||
|     smlsdx      r7, r7, r12, r10    ; o3 = (d1 * 2217 - c1 * 5352 +   7500) |  | ||||||
|  |  | ||||||
|     ldrd        r8, r9, [r0]        ; [i5 | i4] [i7 | i6] |  | ||||||
|  |  | ||||||
|     pkhbt       r3, r4, r6, lsl #4  ; [o1 | o0], keep in register for PART 2 |  | ||||||
|     pkhbt       r6, r5, r7, lsl #4  ; [o3 | o2] |  | ||||||
|  |  | ||||||
|     str         r6, [r1, #4] |  | ||||||
|  |  | ||||||
|     ; coeffs 4-7 |  | ||||||
|     ror         r9, r9, #16         ; [i6 | i7] |  | ||||||
|  |  | ||||||
|     qadd16      r6, r8, r9          ; [i5+i6 | i4+i7] = [b1 | a1] without shift |  | ||||||
|     qsub16      r7, r8, r9          ; [i5-i6 | i4-i7] = [c1 | d1] without shift |  | ||||||
|  |  | ||||||
|     add         r0, r0, r2          ; update input pointer |  | ||||||
|  |  | ||||||
|     qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd |  | ||||||
|                                     ; with 2217*4 and 5352*4 without losing the |  | ||||||
|                                     ; sign bit (overflow) |  | ||||||
|  |  | ||||||
|     smuad       r9, r6, lr          ; o4 = (i5+i6)*8 + (i4+i7)*8 |  | ||||||
|     smusd       r8, r6, lr          ; o6 = (i4+i7)*8 - (i5+i6)*8 |  | ||||||
|  |  | ||||||
|     smlad       r6, r7, r12, r11    ; o5 = (c1 * 2217 + d1 * 5352 +  14500) |  | ||||||
|     smlsdx      r7, r7, r12, r10    ; o7 = (d1 * 2217 - c1 * 5352 +   7500) |  | ||||||
|  |  | ||||||
|     ldrd        r4, r5, [r0]        ; [i9 | i8] [i11 | i10] |  | ||||||
|  |  | ||||||
|     pkhbt       r9, r9, r6, lsl #4  ; [o5 | o4], keep in register for PART 2 |  | ||||||
|     pkhbt       r6, r8, r7, lsl #4  ; [o7 | o6] |  | ||||||
|  |  | ||||||
|     str         r6, [r1, #12] |  | ||||||
|  |  | ||||||
|     ; coeffs 8-11 |  | ||||||
|     ror         r5, r5, #16         ; [i10 | i11] |  | ||||||
|  |  | ||||||
|     qadd16      r6, r4, r5          ; [i9+i10 | i8+i11]=[b1 | a1] without shift |  | ||||||
|     qsub16      r7, r4, r5          ; [i9-i10 | i8-i11]=[c1 | d1] without shift |  | ||||||
|  |  | ||||||
|     add         r0, r0, r2          ; update input pointer |  | ||||||
|  |  | ||||||
|     qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd |  | ||||||
|                                     ; with 2217*4 and 5352*4 without losing the |  | ||||||
|                                     ; sign bit (overflow) |  | ||||||
|  |  | ||||||
|     smuad       r2, r6, lr          ; o8 = (i9+i10)*8 + (i8+i11)*8 |  | ||||||
|     smusd       r8, r6, lr          ; o10 = (i8+i11)*8 - (i9+i10)*8 |  | ||||||
|  |  | ||||||
|     smlad       r6, r7, r12, r11    ; o9 = (c1 * 2217 + d1 * 5352 +  14500) |  | ||||||
|     smlsdx      r7, r7, r12, r10    ; o11 = (d1 * 2217 - c1 * 5352 +   7500) |  | ||||||
|  |  | ||||||
|     ldrd        r4, r5, [r0]        ; [i13 | i12] [i15 | i14] |  | ||||||
|  |  | ||||||
|     pkhbt       r2, r2, r6, lsl #4  ; [o9 | o8], keep in register for PART 2 |  | ||||||
|     pkhbt       r6, r8, r7, lsl #4  ; [o11 | o10] |  | ||||||
|  |  | ||||||
|     str         r6, [r1, #20] |  | ||||||
|  |  | ||||||
|     ; coeffs 12-15 |  | ||||||
|     ror         r5, r5, #16         ; [i14 | i15] |  | ||||||
|  |  | ||||||
|     qadd16      r6, r4, r5          ; [i13+i14 | i12+i15]=[b1|a1] without shift |  | ||||||
|     qsub16      r7, r4, r5          ; [i13-i14 | i12-i15]=[c1|d1] without shift |  | ||||||
|  |  | ||||||
|     qadd16      r7, r7, r7          ; 2x[c1|d1] --> we can use smlad and smlsd |  | ||||||
|                                     ; with 2217*4 and 5352*4 without losing the |  | ||||||
|                                     ; sign bit (overflow) |  | ||||||
|  |  | ||||||
|     smuad       r4, r6, lr          ; o12 = (i13+i14)*8 + (i12+i15)*8 |  | ||||||
|     smusd       r5, r6, lr          ; o14 = (i12+i15)*8 - (i13+i14)*8 |  | ||||||
|  |  | ||||||
|     smlad       r6, r7, r12, r11    ; o13 = (c1 * 2217 + d1 * 5352 +  14500) |  | ||||||
|     smlsdx      r7, r7, r12, r10    ; o15 = (d1 * 2217 - c1 * 5352 +   7500) |  | ||||||
|  |  | ||||||
|     pkhbt       r0, r4, r6, lsl #4  ; [o13 | o12], keep in register for PART 2 |  | ||||||
|     pkhbt       r6, r5, r7, lsl #4  ; [o15 | o14] |  | ||||||
|  |  | ||||||
|     str         r6, [r1, #28] |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     ; PART 2 ------------------------------------------------- |  | ||||||
|     ldr         r11, c12000 |  | ||||||
|     ldr         r10, c51000 |  | ||||||
|     ldr         lr, c0x00070007 |  | ||||||
|  |  | ||||||
|     qadd16      r4, r3, r0          ; a1 = [i1+i13 | i0+i12] |  | ||||||
|     qadd16      r5, r9, r2          ; b1 = [i5+i9  |  i4+i8] |  | ||||||
|     qsub16      r6, r9, r2          ; c1 = [i5-i9  |  i4-i8] |  | ||||||
|     qsub16      r7, r3, r0          ; d1 = [i1-i13 | i0-i12] |  | ||||||
|  |  | ||||||
|     qadd16      r4, r4, lr          ; a1 + 7 |  | ||||||
|  |  | ||||||
|     add         r0, r11, #0x10000   ; add (d!=0) |  | ||||||
|  |  | ||||||
|     qadd16      r2, r4, r5          ; a1 + b1 + 7 |  | ||||||
|     qsub16      r3, r4, r5          ; a1 - b1 + 7 |  | ||||||
|  |  | ||||||
|     ldr         r12, c0x08a914e8    ; [2217 | 5352] |  | ||||||
|  |  | ||||||
|     lsl         r8, r2, #16         ; prepare bottom halfword for scaling |  | ||||||
|     asr         r2, r2, #4          ; scale top halfword |  | ||||||
|     lsl         r9, r3, #16         ; prepare bottom halfword for scaling |  | ||||||
|     asr         r3, r3, #4          ; scale top halfword |  | ||||||
|     pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword |  | ||||||
|     pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword |  | ||||||
|  |  | ||||||
|     smulbt      r2, r6, r12         ; [ ------ | c1*2217] |  | ||||||
|     str         r4, [r1, #0]        ; [     o1 |      o0] |  | ||||||
|     smultt      r3, r6, r12         ; [c1*2217 | ------ ] |  | ||||||
|     str         r5, [r1, #16]       ; [     o9 |      o8] |  | ||||||
|  |  | ||||||
|     smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352] |  | ||||||
|     smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ] |  | ||||||
|  |  | ||||||
|     smulbb      r2, r6, r12         ; [ ------ | c1*5352] |  | ||||||
|     smultb      r3, r6, r12         ; [c1*5352 | ------ ] |  | ||||||
|  |  | ||||||
|     lsls        r6, r7, #16         ; d1 != 0 ? |  | ||||||
|     addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0) |  | ||||||
|     addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0) |  | ||||||
|     asrs        r6, r7, #16 |  | ||||||
|     addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0) |  | ||||||
|     addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0) |  | ||||||
|  |  | ||||||
|     smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000 |  | ||||||
|     smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000 |  | ||||||
|  |  | ||||||
|     pkhtb       r9, r9, r8, asr #16 |  | ||||||
|  |  | ||||||
|     sub         r4, r4, r2 |  | ||||||
|     sub         r5, r5, r3 |  | ||||||
|  |  | ||||||
|     ldr         r3, [r1, #4]        ; [i3 | i2] |  | ||||||
|  |  | ||||||
|     pkhtb       r5, r5, r4, asr #16 ; [o13|o12] |  | ||||||
|  |  | ||||||
|     str         r9, [r1, #8]        ; [o5 | o4] |  | ||||||
|  |  | ||||||
|     ldr         r9, [r1, #12]       ; [i7 | i6] |  | ||||||
|     ldr         r8, [r1, #28]       ; [i15|i14] |  | ||||||
|     ldr         r2, [r1, #20]       ; [i11|i10] |  | ||||||
|     str         r5, [r1, #24]       ; [o13|o12] |  | ||||||
|  |  | ||||||
|     qadd16      r4, r3, r8          ; a1 = [i3+i15 | i2+i14] |  | ||||||
|     qadd16      r5, r9, r2          ; b1 = [i7+i11 | i6+i10] |  | ||||||
|  |  | ||||||
|     qadd16      r4, r4, lr          ; a1 + 7 |  | ||||||
|  |  | ||||||
|     qsub16      r6, r9, r2          ; c1 = [i7-i11 | i6-i10] |  | ||||||
|     qadd16      r2, r4, r5          ; a1 + b1 + 7 |  | ||||||
|     qsub16      r7, r3, r8          ; d1 = [i3-i15 | i2-i14] |  | ||||||
|     qsub16      r3, r4, r5          ; a1 - b1 + 7 |  | ||||||
|  |  | ||||||
|     lsl         r8, r2, #16         ; prepare bottom halfword for scaling |  | ||||||
|     asr         r2, r2, #4          ; scale top halfword |  | ||||||
|     lsl         r9, r3, #16         ; prepare bottom halfword for scaling |  | ||||||
|     asr         r3, r3, #4          ; scale top halfword |  | ||||||
|     pkhtb       r4, r2, r8, asr #20 ; pack and scale bottom halfword |  | ||||||
|     pkhtb       r5, r3, r9, asr #20 ; pack and scale bottom halfword |  | ||||||
|  |  | ||||||
|     smulbt      r2, r6, r12         ; [ ------ | c1*2217] |  | ||||||
|     str         r4, [r1, #4]        ; [     o3 |      o2] |  | ||||||
|     smultt      r3, r6, r12         ; [c1*2217 | ------ ] |  | ||||||
|     str         r5, [r1, #20]       ; [    o11 |     o10] |  | ||||||
|  |  | ||||||
|     smlabb      r8, r7, r12, r2     ; [ ------ | d1*5352] |  | ||||||
|     smlatb      r9, r7, r12, r3     ; [d1*5352 | ------ ] |  | ||||||
|  |  | ||||||
|     smulbb      r2, r6, r12         ; [ ------ | c1*5352] |  | ||||||
|     smultb      r3, r6, r12         ; [c1*5352 | ------ ] |  | ||||||
|  |  | ||||||
|     lsls        r6, r7, #16         ; d1 != 0 ? |  | ||||||
|     addeq       r8, r8, r11         ; c1_b*2217+d1_b*5352+12000 + (d==0) |  | ||||||
|     addne       r8, r8, r0          ; c1_b*2217+d1_b*5352+12000 + (d!=0) |  | ||||||
|  |  | ||||||
|     asrs        r6, r7, #16 |  | ||||||
|     addeq       r9, r9, r11         ; c1_t*2217+d1_t*5352+12000 + (d==0) |  | ||||||
|     addne       r9, r9, r0          ; c1_t*2217+d1_t*5352+12000 + (d!=0) |  | ||||||
|  |  | ||||||
|     smlabt      r4, r7, r12, r10    ; [ ------ | d1*2217] + 51000 |  | ||||||
|     smlatt      r5, r7, r12, r10    ; [d1*2217 | ------ ] + 51000 |  | ||||||
|  |  | ||||||
|     pkhtb       r9, r9, r8, asr #16 |  | ||||||
|  |  | ||||||
|     sub         r4, r4, r2 |  | ||||||
|     sub         r5, r5, r3 |  | ||||||
|  |  | ||||||
|     str         r9, [r1, #12]       ; [o7 | o6] |  | ||||||
|     pkhtb       r5, r5, r4, asr #16 ; [o15|o14] |  | ||||||
|  |  | ||||||
|     str         r5, [r1, #28]       ; [o15|o14] |  | ||||||
|  |  | ||||||
|     ldmfd       sp!, {r4 - r12, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ; Used constants |  | ||||||
| c7500 |  | ||||||
|     DCD     7500 |  | ||||||
| c14500 |  | ||||||
|     DCD     14500 |  | ||||||
| c0x22a453a0 |  | ||||||
|     DCD     0x22a453a0 |  | ||||||
| c0x00080008 |  | ||||||
|     DCD     0x00080008 |  | ||||||
| c12000 |  | ||||||
|     DCD     12000 |  | ||||||
| c51000 |  | ||||||
|     DCD     51000 |  | ||||||
| c0x00070007 |  | ||||||
|     DCD     0x00070007 |  | ||||||
| c0x08a914e8 |  | ||||||
|     DCD     0x08a914e8 |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,212 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|     EXPORT |vp8_short_walsh4x4_armv6| |  | ||||||
|  |  | ||||||
|     ARM |  | ||||||
|     REQUIRE8 |  | ||||||
|     PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY  ; name this block of code |  | ||||||
|  |  | ||||||
| ;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch) |  | ||||||
| ; r0    short *input, |  | ||||||
| ; r1    short *output, |  | ||||||
| ; r2    int pitch |  | ||||||
| |vp8_short_walsh4x4_armv6| PROC |  | ||||||
|  |  | ||||||
|     stmdb       sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ldrd        r4, r5, [r0], r2 |  | ||||||
|     ldr         lr, c00040004 |  | ||||||
|     ldrd        r6, r7, [r0], r2 |  | ||||||
|  |  | ||||||
|     ; 0-3 |  | ||||||
|     qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2] |  | ||||||
|     qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2] |  | ||||||
|  |  | ||||||
|     ldrd        r8, r9, [r0], r2 |  | ||||||
|     ; 4-7 |  | ||||||
|     qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6] |  | ||||||
|     qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6] |  | ||||||
|  |  | ||||||
|     ldrd        r10, r11, [r0] |  | ||||||
|     ; 8-11 |  | ||||||
|     qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10] |  | ||||||
|     qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10] |  | ||||||
|  |  | ||||||
|     ; 12-15 |  | ||||||
|     qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14] |  | ||||||
|     qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14] |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     lsls        r2, r3, #16 |  | ||||||
|     smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2 |  | ||||||
|     addne       r11, r11, #1        ; A0 += (a1!=0) |  | ||||||
|  |  | ||||||
|     lsls        r2, r7, #16 |  | ||||||
|     smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2 |  | ||||||
|     addne       r12, r12, #1        ; C0 += (a1!=0) |  | ||||||
|  |  | ||||||
|     add         r0, r11, r12        ; a1_0 = A0 + C0 |  | ||||||
|     sub         r11, r11, r12       ; b1_0 = A0 - C0 |  | ||||||
|  |  | ||||||
|     lsls        r2, r5, #16 |  | ||||||
|     smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2 |  | ||||||
|     addne       r12, r12, #1        ; B0 += (a1!=0) |  | ||||||
|  |  | ||||||
|     lsls        r2, r9, #16 |  | ||||||
|     smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2 |  | ||||||
|     addne       r2, r2, #1          ; D0 += (a1!=0) |  | ||||||
|  |  | ||||||
|     add         lr, r12, r2         ; d1_0 = B0 + D0 |  | ||||||
|     sub         r12, r12, r2        ; c1_0 = B0 - D0 |  | ||||||
|  |  | ||||||
|     ; op[0,4,8,12] |  | ||||||
|     adds        r2, r0, lr          ; a2 = a1_0 + d1_0 |  | ||||||
|     addmi       r2, r2, #1          ; += a2 < 0 |  | ||||||
|     add         r2, r2, #3          ; += 3 |  | ||||||
|     subs        r0, r0, lr          ; d2 = a1_0 - d1_0 |  | ||||||
|     mov         r2, r2, asr #3      ; >> 3 |  | ||||||
|     strh        r2, [r1]            ; op[0] |  | ||||||
|  |  | ||||||
|     addmi       r0, r0, #1          ; += a2 < 0 |  | ||||||
|     add         r0, r0, #3          ; += 3 |  | ||||||
|     ldr         lr, c00040004 |  | ||||||
|     mov         r0, r0, asr #3      ; >> 3 |  | ||||||
|     strh        r0, [r1, #24]       ; op[12] |  | ||||||
|  |  | ||||||
|     adds        r2, r11, r12        ; b2 = b1_0 + c1_0 |  | ||||||
|     addmi       r2, r2, #1          ; += a2 < 0 |  | ||||||
|     add         r2, r2, #3          ; += 3 |  | ||||||
|     subs        r0, r11, r12        ; c2 = b1_0 - c1_0 |  | ||||||
|     mov         r2, r2, asr #3      ; >> 3 |  | ||||||
|     strh        r2, [r1, #8]        ; op[4] |  | ||||||
|  |  | ||||||
|     addmi       r0, r0, #1          ; += a2 < 0 |  | ||||||
|     add         r0, r0, #3          ; += 3 |  | ||||||
|     smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2 |  | ||||||
|     smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2 |  | ||||||
|     mov         r0, r0, asr #3      ; >> 3 |  | ||||||
|     strh        r0, [r1, #16]       ; op[8] |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     ; op[3,7,11,15] |  | ||||||
|     add         r0, r3, r7          ; a1_3 = A3 + C3 |  | ||||||
|     sub         r3, r3, r7          ; b1_3 = A3 - C3 |  | ||||||
|  |  | ||||||
|     smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2 |  | ||||||
|     smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2 |  | ||||||
|     add         r7, r5, r9          ; d1_3 = B3 + D3 |  | ||||||
|     sub         r5, r5, r9          ; c1_3 = B3 - D3 |  | ||||||
|  |  | ||||||
|     adds        r2, r0, r7          ; a2 = a1_3 + d1_3 |  | ||||||
|     addmi       r2, r2, #1          ; += a2 < 0 |  | ||||||
|     add         r2, r2, #3          ; += 3 |  | ||||||
|     adds        r9, r3, r5          ; b2 = b1_3 + c1_3 |  | ||||||
|     mov         r2, r2, asr #3      ; >> 3 |  | ||||||
|     strh        r2, [r1, #6]        ; op[3] |  | ||||||
|  |  | ||||||
|     addmi       r9, r9, #1          ; += a2 < 0 |  | ||||||
|     add         r9, r9, #3          ; += 3 |  | ||||||
|     subs        r2, r3, r5          ; c2 = b1_3 - c1_3 |  | ||||||
|     mov         r9, r9, asr #3      ; >> 3 |  | ||||||
|     strh        r9, [r1, #14]       ; op[7] |  | ||||||
|  |  | ||||||
|     addmi       r2, r2, #1          ; += a2 < 0 |  | ||||||
|     add         r2, r2, #3          ; += 3 |  | ||||||
|     subs        r9, r0, r7          ; d2 = a1_3 - d1_3 |  | ||||||
|     mov         r2, r2, asr #3      ; >> 3 |  | ||||||
|     strh        r2, [r1, #22]       ; op[11] |  | ||||||
|  |  | ||||||
|     addmi       r9, r9, #1          ; += a2 < 0 |  | ||||||
|     add         r9, r9, #3          ; += 3 |  | ||||||
|     smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2 |  | ||||||
|     smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2 |  | ||||||
|     mov         r9, r9, asr #3      ; >> 3 |  | ||||||
|     strh        r9, [r1, #30]       ; op[15] |  | ||||||
|  |  | ||||||
|     ; op[1,5,9,13] |  | ||||||
|     add         r0, r3, r5          ; a1_1 = A1 + C1 |  | ||||||
|     sub         r3, r3, r5          ; b1_1 = A1 - C1 |  | ||||||
|  |  | ||||||
|     smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2 |  | ||||||
|     smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2 |  | ||||||
|     add         r5, r7, r9          ; d1_1 = B1 + D1 |  | ||||||
|     sub         r7, r7, r9          ; c1_1 = B1 - D1 |  | ||||||
|  |  | ||||||
|     adds        r2, r0, r5          ; a2 = a1_1 + d1_1 |  | ||||||
|     addmi       r2, r2, #1          ; += a2 < 0 |  | ||||||
|     add         r2, r2, #3          ; += 3 |  | ||||||
|     adds        r9, r3, r7          ; b2 = b1_1 + c1_1 |  | ||||||
|     mov         r2, r2, asr #3      ; >> 3 |  | ||||||
|     strh        r2, [r1, #2]        ; op[1] |  | ||||||
|  |  | ||||||
|     addmi       r9, r9, #1          ; += a2 < 0 |  | ||||||
|     add         r9, r9, #3          ; += 3 |  | ||||||
|     subs        r2, r3, r7          ; c2 = b1_1 - c1_1 |  | ||||||
|     mov         r9, r9, asr #3      ; >> 3 |  | ||||||
|     strh        r9, [r1, #10]       ; op[5] |  | ||||||
|  |  | ||||||
|     addmi       r2, r2, #1          ; += a2 < 0 |  | ||||||
|     add         r2, r2, #3          ; += 3 |  | ||||||
|     subs        r9, r0, r5          ; d2 = a1_1 - d1_1 |  | ||||||
|     mov         r2, r2, asr #3      ; >> 3 |  | ||||||
|     strh        r2, [r1, #18]       ; op[9] |  | ||||||
|  |  | ||||||
|     addmi       r9, r9, #1          ; += a2 < 0 |  | ||||||
|     add         r9, r9, #3          ; += 3 |  | ||||||
|     smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2 |  | ||||||
|     smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2 |  | ||||||
|     mov         r9, r9, asr #3      ; >> 3 |  | ||||||
|     strh        r9, [r1, #26]       ; op[13] |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     ; op[2,6,10,14] |  | ||||||
|     add         r11, r4, r8         ; a1_2 = A2 + C2 |  | ||||||
|     sub         r12, r4, r8         ; b1_2 = A2 - C2 |  | ||||||
|  |  | ||||||
|     smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2 |  | ||||||
|     smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2 |  | ||||||
|     add         r4, r6, r10         ; d1_2 = B2 + D2 |  | ||||||
|     sub         r8, r6, r10         ; c1_2 = B2 - D2 |  | ||||||
|  |  | ||||||
|     adds        r2, r11, r4         ; a2 = a1_2 + d1_2 |  | ||||||
|     addmi       r2, r2, #1          ; += a2 < 0 |  | ||||||
|     add         r2, r2, #3          ; += 3 |  | ||||||
|     adds        r9, r12, r8         ; b2 = b1_2 + c1_2 |  | ||||||
|     mov         r2, r2, asr #3      ; >> 3 |  | ||||||
|     strh        r2, [r1, #4]        ; op[2] |  | ||||||
|  |  | ||||||
|     addmi       r9, r9, #1          ; += a2 < 0 |  | ||||||
|     add         r9, r9, #3          ; += 3 |  | ||||||
|     subs        r2, r12, r8         ; c2 = b1_2 - c1_2 |  | ||||||
|     mov         r9, r9, asr #3      ; >> 3 |  | ||||||
|     strh        r9, [r1, #12]       ; op[6] |  | ||||||
|  |  | ||||||
|     addmi       r2, r2, #1          ; += a2 < 0 |  | ||||||
|     add         r2, r2, #3          ; += 3 |  | ||||||
|     subs        r9, r11, r4         ; d2 = a1_2 - d1_2 |  | ||||||
|     mov         r2, r2, asr #3      ; >> 3 |  | ||||||
|     strh        r2, [r1, #20]       ; op[10] |  | ||||||
|  |  | ||||||
|     addmi       r9, r9, #1          ; += a2 < 0 |  | ||||||
|     add         r9, r9, #3          ; += 3 |  | ||||||
|     mov         r9, r9, asr #3      ; >> 3 |  | ||||||
|     strh        r9, [r1, #28]       ; op[14] |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     ldmia       sp!, {r4 - r11, pc} |  | ||||||
|     ENDP        ; |vp8_short_walsh4x4_armv6| |  | ||||||
|  |  | ||||||
| c00040004 |  | ||||||
|     DCD         0x00040004 |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,21 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2011 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include "vpx_config.h" |  | ||||||
| #include "vp8_rtcd.h" |  | ||||||
|  |  | ||||||
| #if HAVE_MEDIA |  | ||||||
|  |  | ||||||
| /* Runs the 4x4 forward DCT twice with the same pitch: once on |  | ||||||
|  * input/output and once on input + 4 / output + 16. */ |  | ||||||
| void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch) { |  | ||||||
|   vp8_short_fdct4x4_armv6(input, output, pitch); |  | ||||||
|   vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #endif /* HAVE_MEDIA */ |  | ||||||
| @@ -123,30 +123,8 @@ ifeq ($(CONFIG_POSTPROC),yes) | |||||||
| VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c | VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c | ||||||
| endif | endif | ||||||
|  |  | ||||||
| # common (c) |  | ||||||
| VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c |  | ||||||
| VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c |  | ||||||
| VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.c |  | ||||||
|  |  | ||||||
| # common (media) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/bilinearfilter_arm.c |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/bilinearfilter_arm.h |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/bilinearfilter_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/copymem8x4_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/copymem8x8_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/copymem16x16_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dc_only_idct_add_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/iwalsh_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/filter_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/loopfilter_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/simpleloopfilter_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/sixtappredict8x4_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequant_idct_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequantize_v6$(ASM) |  | ||||||
| VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_blk_v6.c |  | ||||||
|  |  | ||||||
| # common (neon intrinsics) | # common (neon intrinsics) | ||||||
|  | VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.c | ||||||
| VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c | VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c | ||||||
| VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem_neon.c | VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/copymem_neon.c | ||||||
| VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c | VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/dc_only_idct_add_neon.c | ||||||
|   | |||||||
| @@ -16,10 +16,6 @@ VP8_CX_SRCS-no  += $(VP8_COMMON_SRCS-no) | |||||||
| VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes) | VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes) | ||||||
| VP8_CX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no) | VP8_CX_SRCS_REMOVE-no  += $(VP8_COMMON_SRCS_REMOVE-no) | ||||||
|  |  | ||||||
| ifeq ($(ARCH_ARM),yes) |  | ||||||
|   include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk |  | ||||||
| endif |  | ||||||
|  |  | ||||||
| VP8_CX_SRCS-yes += vp8cx.mk | VP8_CX_SRCS-yes += vp8cx.mk | ||||||
|  |  | ||||||
| VP8_CX_SRCS-yes += vp8_cx_iface.c | VP8_CX_SRCS-yes += vp8_cx_iface.c | ||||||
| @@ -101,6 +97,11 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes) | |||||||
| VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm | VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm | ||||||
| endif | endif | ||||||
|  |  | ||||||
|  | VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c | ||||||
|  | VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c | ||||||
|  | VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c | ||||||
|  | VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c | ||||||
|  |  | ||||||
| VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c | VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c | ||||||
| VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c | VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c | ||||||
| VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c | VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c | ||||||
|   | |||||||
| @@ -1,28 +0,0 @@ | |||||||
| ## |  | ||||||
| ##  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ## |  | ||||||
| ##  Use of this source code is governed by a BSD-style license |  | ||||||
| ##  that can be found in the LICENSE file in the root of the source |  | ||||||
| ##  tree. An additional intellectual property rights grant can be found |  | ||||||
| ##  in the file PATENTS.  All contributing project authors may |  | ||||||
| ##  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ## |  | ||||||
|  |  | ||||||
|  |  | ||||||
| VP8_CX_SRCS-$(ARCH_ARM)  += vp8cx_arm.mk |  | ||||||
|  |  | ||||||
| #File list for arm |  | ||||||
| # encoder |  | ||||||
| VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c |  | ||||||
|  |  | ||||||
| #File list for media |  | ||||||
| # encoder |  | ||||||
| VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM) |  | ||||||
| VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/walsh_v6$(ASM) |  | ||||||
|  |  | ||||||
| #File list for neon |  | ||||||
| # encoder |  | ||||||
| VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c |  | ||||||
| VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon.c |  | ||||||
| VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c |  | ||||||
| VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c |  | ||||||
| @@ -1,237 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vpx_filter_block2d_bil_first_pass_media| |  | ||||||
|     EXPORT  |vpx_filter_block2d_bil_second_pass_media| |  | ||||||
|  |  | ||||||
|     AREA    |.text|, CODE, READONLY  ; name this block of code |  | ||||||
|  |  | ||||||
| ;------------------------------------- |  | ||||||
| ; First pass of the two-tap bilinear filter: each output is computed from |  | ||||||
| ; two adjacent source bytes, rounded (+0x40, >>7) and stored as a halfword. |  | ||||||
| ; |  | ||||||
| ; r0    unsigned char  *src_ptr, |  | ||||||
| ; r1    unsigned short *dst_ptr, |  | ||||||
| ; r2    unsigned int    src_pitch, |  | ||||||
| ; r3    unsigned int    height, |  | ||||||
| ; stack unsigned int    width, |  | ||||||
| ; stack const short    *vpx_filter |  | ||||||
| ;------------------------------------- |  | ||||||
| ; The output is stored transposed in the output array to make second-pass filtering easy. |  | ||||||
| |vpx_filter_block2d_bil_first_pass_media| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ldr     r11, [sp, #40]                  ; vpx_filter address |  | ||||||
|     ldr     r4, [sp, #36]                   ; width |  | ||||||
|  |  | ||||||
|     mov     r12, r3                         ; outer-loop counter |  | ||||||
|  |  | ||||||
|     add     r7, r2, r4                      ; preload next row |  | ||||||
|     pld     [r0, r7] |  | ||||||
|  |  | ||||||
|     sub     r2, r2, r4                      ; src increment for height loop |  | ||||||
|  |  | ||||||
|     ldr     r5, [r11]                       ; load up filter coefficients |  | ||||||
|  |  | ||||||
|     mov     r3, r3, lsl #1                  ; height*2 |  | ||||||
|     add     r3, r3, #2                      ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1) |  | ||||||
|  |  | ||||||
|     mov     r11, r1                         ; save dst_ptr for each row |  | ||||||
|  |  | ||||||
|     ; r5 holds both 16-bit taps loaded as one word |  | ||||||
|     cmp     r5, #128                        ; if filter coef = 128, then skip the filter |  | ||||||
|     beq     bil_null_1st_filter |  | ||||||
|  |  | ||||||
| |bil_height_loop_1st_v6| |  | ||||||
|     ldrb    r6, [r0]                        ; load source data |  | ||||||
|     ldrb    r7, [r0, #1] |  | ||||||
|     ldrb    r8, [r0, #2] |  | ||||||
|     mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter |  | ||||||
|  |  | ||||||
| |bil_width_loop_1st_v6| |  | ||||||
|     ldrb    r9, [r0, #3] |  | ||||||
|     ldrb    r10, [r0, #4] |  | ||||||
|  |  | ||||||
|     pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0] |  | ||||||
|     pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1] |  | ||||||
|  |  | ||||||
|     smuad   r6, r6, r5                      ; apply the filter |  | ||||||
|     pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2] |  | ||||||
|     smuad   r7, r7, r5 |  | ||||||
|     pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3] |  | ||||||
|  |  | ||||||
|     smuad   r8, r8, r5 |  | ||||||
|     smuad   r9, r9, r5 |  | ||||||
|  |  | ||||||
|     add     r0, r0, #4 |  | ||||||
|     subs    lr, lr, #1 |  | ||||||
|  |  | ||||||
|     add     r6, r6, #0x40                   ; round_shift_and_clamp |  | ||||||
|     add     r7, r7, #0x40 |  | ||||||
|     usat    r6, #16, r6, asr #7 |  | ||||||
|     usat    r7, #16, r7, asr #7 |  | ||||||
|  |  | ||||||
|     strh    r6, [r1], r3                    ; result is transposed and stored |  | ||||||
|  |  | ||||||
|     add     r8, r8, #0x40                   ; round_shift_and_clamp |  | ||||||
|     strh    r7, [r1], r3 |  | ||||||
|     add     r9, r9, #0x40 |  | ||||||
|     usat    r8, #16, r8, asr #7 |  | ||||||
|     usat    r9, #16, r9, asr #7 |  | ||||||
|  |  | ||||||
|     strh    r8, [r1], r3                    ; result is transposed and stored |  | ||||||
|  |  | ||||||
|     ; the ne conditions below still use the flags set by "subs lr" above |  | ||||||
|     ldrneb  r6, [r0]                        ; load source data |  | ||||||
|     strh    r9, [r1], r3 |  | ||||||
|  |  | ||||||
|     ldrneb  r7, [r0, #1] |  | ||||||
|     ldrneb  r8, [r0, #2] |  | ||||||
|  |  | ||||||
|     bne     bil_width_loop_1st_v6 |  | ||||||
|  |  | ||||||
|     add     r0, r0, r2                      ; move to next input row |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     add     r9, r2, r4, lsl #1              ; adding back block width |  | ||||||
|     pld     [r0, r9]                        ; preload next row |  | ||||||
|  |  | ||||||
|     add     r11, r11, #2                    ; move over to next column |  | ||||||
|     mov     r1, r11 |  | ||||||
|  |  | ||||||
|     bne     bil_height_loop_1st_v6 |  | ||||||
|  |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
| ; Unfiltered path: source bytes are copied to the output as halfwords. |  | ||||||
| |bil_null_1st_filter| |  | ||||||
| |bil_height_loop_null_1st| |  | ||||||
|     mov     lr, r4, lsr #2                  ; loop counter |  | ||||||
|  |  | ||||||
| |bil_width_loop_null_1st| |  | ||||||
|     ldrb    r6, [r0]                        ; load data |  | ||||||
|     ldrb    r7, [r0, #1] |  | ||||||
|     ldrb    r8, [r0, #2] |  | ||||||
|     ldrb    r9, [r0, #3] |  | ||||||
|  |  | ||||||
|     strh    r6, [r1], r3                    ; store it to intermediate buffer |  | ||||||
|     add     r0, r0, #4 |  | ||||||
|     strh    r7, [r1], r3 |  | ||||||
|     subs    lr, lr, #1 |  | ||||||
|     strh    r8, [r1], r3 |  | ||||||
|     strh    r9, [r1], r3 |  | ||||||
|  |  | ||||||
|     bne     bil_width_loop_null_1st |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, r2                      ; move to next input line |  | ||||||
|     add     r11, r11, #2                    ; move over to next column |  | ||||||
|     mov     r1, r11 |  | ||||||
|  |  | ||||||
|     bne     bil_height_loop_null_1st |  | ||||||
|  |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
|     ENDP  ; |vpx_filter_block2d_bil_first_pass_media| |  | ||||||
|  |  | ||||||
|  |  | ||||||
| ;--------------------------------- |  | ||||||
| ; Second pass of the two-tap bilinear filter: reads the halfword buffer |  | ||||||
| ; written by the first pass, filters adjacent halfwords, rounds |  | ||||||
| ; (+0x40, >>7), saturates to 8 bits and stores bytes dst_pitch apart. |  | ||||||
| ; |  | ||||||
| ; r0    unsigned short *src_ptr, |  | ||||||
| ; r1    unsigned char  *dst_ptr, |  | ||||||
| ; r2    int             dst_pitch, |  | ||||||
| ; r3    unsigned int    height, |  | ||||||
| ; stack unsigned int    width, |  | ||||||
| ; stack const short    *vpx_filter |  | ||||||
| ;--------------------------------- |  | ||||||
| |vpx_filter_block2d_bil_second_pass_media| PROC |  | ||||||
|     stmdb   sp!, {r4 - r11, lr} |  | ||||||
|  |  | ||||||
|     ldr     r11, [sp, #40]                  ; vpx_filter address |  | ||||||
|     ldr     r4, [sp, #36]                   ; width |  | ||||||
|  |  | ||||||
|     ldr     r5, [r11]                       ; load up filter coefficients |  | ||||||
|     mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix |  | ||||||
|     mov     r11, r1 |  | ||||||
|  |  | ||||||
|     cmp     r5, #128                        ; if filter coef = 128, then skip the filter |  | ||||||
|     beq     bil_null_2nd_filter |  | ||||||
|  |  | ||||||
| |bil_height_loop_2nd| |  | ||||||
|     ldr     r6, [r0]                        ; load the data |  | ||||||
|     ldr     r8, [r0, #4] |  | ||||||
|     ldrh    r10, [r0, #8] |  | ||||||
|     mov     lr, r3, lsr #2                  ; loop counter |  | ||||||
|  |  | ||||||
| |bil_width_loop_2nd| |  | ||||||
|     pkhtb   r7, r6, r8                      ; src[1] | src[2] |  | ||||||
|     pkhtb   r9, r8, r10                     ; src[3] | src[4] |  | ||||||
|  |  | ||||||
|     smuad   r6, r6, r5                      ; apply filter |  | ||||||
|     smuad   r8, r8, r5                      ; apply filter |  | ||||||
|  |  | ||||||
|     subs    lr, lr, #1 |  | ||||||
|  |  | ||||||
|     smuadx  r7, r7, r5                      ; apply filter |  | ||||||
|     smuadx  r9, r9, r5                      ; apply filter |  | ||||||
|  |  | ||||||
|     add     r0, r0, #8 |  | ||||||
|  |  | ||||||
|     add     r6, r6, #0x40                   ; round_shift_and_clamp |  | ||||||
|     add     r7, r7, #0x40 |  | ||||||
|     usat    r6, #8, r6, asr #7 |  | ||||||
|     usat    r7, #8, r7, asr #7 |  | ||||||
|     strb    r6, [r1], r2                    ; the result is transposed back and stored |  | ||||||
|  |  | ||||||
|     add     r8, r8, #0x40                   ; round_shift_and_clamp |  | ||||||
|     strb    r7, [r1], r2 |  | ||||||
|     add     r9, r9, #0x40 |  | ||||||
|     usat    r8, #8, r8, asr #7 |  | ||||||
|     usat    r9, #8, r9, asr #7 |  | ||||||
|     strb    r8, [r1], r2                    ; the result is transposed back and stored |  | ||||||
|  |  | ||||||
|     ; the ne conditions below still use the flags set by "subs lr" above |  | ||||||
|     ldrne   r6, [r0]                        ; load data |  | ||||||
|     strb    r9, [r1], r2 |  | ||||||
|     ldrne   r8, [r0, #4] |  | ||||||
|     ldrneh  r10, [r0, #8] |  | ||||||
|  |  | ||||||
|     bne     bil_width_loop_2nd |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, #4                      ; update src for next row |  | ||||||
|     add     r11, r11, #1 |  | ||||||
|     mov     r1, r11 |  | ||||||
|  |  | ||||||
|     bne     bil_height_loop_2nd |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|  |  | ||||||
| ; Unfiltered path: the low byte of each source halfword is copied out. |  | ||||||
| |bil_null_2nd_filter| |  | ||||||
| |bil_height_loop_null_2nd| |  | ||||||
|     mov     lr, r3, lsr #2 |  | ||||||
|  |  | ||||||
| |bil_width_loop_null_2nd| |  | ||||||
|     ldr     r6, [r0], #4                    ; load data |  | ||||||
|     subs    lr, lr, #1 |  | ||||||
|     ldr     r8, [r0], #4 |  | ||||||
|  |  | ||||||
|     strb    r6, [r1], r2                    ; store data |  | ||||||
|     mov     r7, r6, lsr #16 |  | ||||||
|     strb    r7, [r1], r2 |  | ||||||
|     mov     r9, r8, lsr #16 |  | ||||||
|     strb    r8, [r1], r2 |  | ||||||
|     strb    r9, [r1], r2 |  | ||||||
|  |  | ||||||
|     bne     bil_width_loop_null_2nd |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     add     r0, r0, #4 |  | ||||||
|     add     r11, r11, #1 |  | ||||||
|     mov     r1, r11 |  | ||||||
|  |  | ||||||
|     bne     bil_height_loop_null_2nd |  | ||||||
|  |  | ||||||
|     ldmia   sp!, {r4 - r11, pc} |  | ||||||
|     ENDP  ; |vpx_filter_block2d_bil_second_pass_media| |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,95 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vpx_sad16x16_media| |  | ||||||
|  |  | ||||||
|     ARM |  | ||||||
|     REQUIRE8 |  | ||||||
|     PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA ||.text||, CODE, READONLY, ALIGN=2 |  | ||||||
|  |  | ||||||
| ; Sum of absolute differences between a 16-byte-wide src block and ref |  | ||||||
| ; block. The loop runs 8 times and handles two rows per iteration; the |  | ||||||
| ; accumulated sad is returned in r0. |  | ||||||
| ; |  | ||||||
| ; r0    const unsigned char *src_ptr |  | ||||||
| ; r1    int  src_stride |  | ||||||
| ; r2    const unsigned char *ref_ptr |  | ||||||
| ; r3    int  ref_stride |  | ||||||
| |vpx_sad16x16_media| PROC |  | ||||||
|     stmfd   sp!, {r4-r12, lr} |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #0] |  | ||||||
|     pld     [r2, r3, lsl #0] |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|  |  | ||||||
|     mov     r4, #0              ; sad = 0; |  | ||||||
|     mov     r5, #8              ; loop count |  | ||||||
|  |  | ||||||
| loop |  | ||||||
|     ; 1st row |  | ||||||
|     ldr     r6, [r0, #0x0]      ; load 4 src pixels (1A) |  | ||||||
|     ldr     r8, [r2, #0x0]      ; load 4 ref pixels (1A) |  | ||||||
|     ldr     r7, [r0, #0x4]      ; load 4 src pixels (1A) |  | ||||||
|     ldr     r9, [r2, #0x4]      ; load 4 ref pixels (1A) |  | ||||||
|     ldr     r10, [r0, #0x8]     ; load 4 src pixels (1B) |  | ||||||
|     ldr     r11, [r0, #0xC]     ; load 4 src pixels (1B) |  | ||||||
|  |  | ||||||
|     usada8  r4, r8, r6, r4      ; calculate sad for 4 pixels |  | ||||||
|     usad8   r8, r7, r9          ; calculate sad for 4 pixels |  | ||||||
|  |  | ||||||
|     ldr     r12, [r2, #0x8]     ; load 4 ref pixels (1B) |  | ||||||
|     ldr     lr, [r2, #0xC]      ; load 4 ref pixels (1B) |  | ||||||
|  |  | ||||||
|     add     r0, r0, r1          ; set src pointer to next row |  | ||||||
|     add     r2, r2, r3          ; set ref pointer to next row |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|  |  | ||||||
|     usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels |  | ||||||
|     usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels |  | ||||||
|  |  | ||||||
|     ldr     r6, [r0, #0x0]      ; load 4 src pixels (2A) |  | ||||||
|     ldr     r7, [r0, #0x4]      ; load 4 src pixels (2A) |  | ||||||
|     add     r4, r4, r8          ; add partial sad values |  | ||||||
|  |  | ||||||
|     ; 2nd row |  | ||||||
|     ldr     r8, [r2, #0x0]      ; load 4 ref pixels (2A) |  | ||||||
|     ldr     r9, [r2, #0x4]      ; load 4 ref pixels (2A) |  | ||||||
|     ldr     r10, [r0, #0x8]     ; load 4 src pixels (2B) |  | ||||||
|     ldr     r11, [r0, #0xC]     ; load 4 src pixels (2B) |  | ||||||
|  |  | ||||||
|     usada8  r4, r6, r8, r4      ; calculate sad for 4 pixels |  | ||||||
|     usad8   r8, r7, r9          ; calculate sad for 4 pixels |  | ||||||
|  |  | ||||||
|     ldr     r12, [r2, #0x8]     ; load 4 ref pixels (2B) |  | ||||||
|     ldr     lr, [r2, #0xC]      ; load 4 ref pixels (2B) |  | ||||||
|  |  | ||||||
|     add     r0, r0, r1          ; set src pointer to next row |  | ||||||
|     add     r2, r2, r3          ; set ref pointer to next row |  | ||||||
|  |  | ||||||
|     usada8  r4, r10, r12, r4    ; calculate sad for 4 pixels |  | ||||||
|     usada8  r8, r11, lr, r8     ; calculate sad for 4 pixels |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|  |  | ||||||
|     subs    r5, r5, #1          ; decrement loop counter |  | ||||||
|     add     r4, r4, r8          ; add partial sad values |  | ||||||
|  |  | ||||||
|     bne     loop |  | ||||||
|  |  | ||||||
|     mov     r0, r4              ; return sad |  | ||||||
|     ldmfd   sp!, {r4-r12, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
|     END |  | ||||||
|  |  | ||||||
| @@ -1,80 +0,0 @@ | |||||||
| /* |  | ||||||
|  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved. |  | ||||||
|  * |  | ||||||
|  *  Use of this source code is governed by a BSD-style license |  | ||||||
|  *  that can be found in the LICENSE file in the root of the source |  | ||||||
|  *  tree. An additional intellectual property rights grant can be found |  | ||||||
|  *  in the file PATENTS.  All contributing project authors may |  | ||||||
|  *  be found in the AUTHORS file in the root of the source tree. |  | ||||||
|  */ |  | ||||||
|  |  | ||||||
| #include "./vpx_config.h" |  | ||||||
| #include "./vpx_dsp_rtcd.h" |  | ||||||
| #include "vpx/vpx_integer.h" |  | ||||||
| #include "vpx_ports/mem.h" |  | ||||||
|  |  | ||||||
| #if HAVE_MEDIA |  | ||||||
| // Two-tap bilinear filter coefficient pairs, indexed by the xoffset / |  | ||||||
| // yoffset arguments of the functions below. Each pair sums to 128. |  | ||||||
| static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 }, |  | ||||||
|                                                       { 96, 32 }, { 80, 48 }, |  | ||||||
|                                                       { 64, 64 }, { 48, 80 }, |  | ||||||
|                                                       { 32, 96 }, { 16, 112 } }; |  | ||||||
|  |  | ||||||
| // First filter pass, implemented in assembly; writes 16-bit values to dst_ptr. |  | ||||||
| extern void vpx_filter_block2d_bil_first_pass_media( |  | ||||||
|     const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch, |  | ||||||
|     uint32_t height, uint32_t width, const int16_t *filter); |  | ||||||
|  |  | ||||||
| // Second filter pass, implemented in assembly; writes 8-bit values to dst_ptr. |  | ||||||
| // NOTE(review): the assembly header documents the third argument as |  | ||||||
| // dst_pitch, so the name src_pitch here looks stale - confirm. |  | ||||||
| extern void vpx_filter_block2d_bil_second_pass_media( |  | ||||||
|     const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch, |  | ||||||
|     uint32_t height, uint32_t width, const int16_t *filter); |  | ||||||
|  |  | ||||||
| // Filters the source block with the bilinear filters selected by xoffset |  | ||||||
| // and yoffset (two passes), then returns vpx_variance8x8_media() of the |  | ||||||
| // filtered block against dst_ptr. sse is forwarded to that call unchanged. |  | ||||||
| // xoffset and yoffset index an 8-entry table and are not range-checked here. |  | ||||||
| unsigned int vpx_sub_pixel_variance8x8_media( |  | ||||||
|     const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, |  | ||||||
|     const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { |  | ||||||
|   uint16_t first_pass[10 * 8]; |  | ||||||
|   uint8_t second_pass[8 * 8]; |  | ||||||
|   const int16_t *HFilter, *VFilter; |  | ||||||
|  |  | ||||||
|   HFilter = bilinear_filters_media[xoffset]; |  | ||||||
|   VFilter = bilinear_filters_media[yoffset]; |  | ||||||
|  |  | ||||||
|   // The first pass is asked for 9 rows, one more than the block height. |  | ||||||
|   vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, |  | ||||||
|                                           src_pixels_per_line, 9, 8, HFilter); |  | ||||||
|   vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8, |  | ||||||
|                                            VFilter); |  | ||||||
|  |  | ||||||
|   return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line, |  | ||||||
|                                sse); |  | ||||||
| } |  | ||||||
|  |  | ||||||
| // 16x16 counterpart of the 8x8 routine. Offsets of exactly 4 on one or both |  | ||||||
| // axes (with 0 on the other axis, if any) are dispatched to the dedicated |  | ||||||
| // vpx_variance_halfpixvar16x16_{h,v,hv}_media routines; every other |  | ||||||
| // combination goes through the generic two-pass bilinear filter followed by |  | ||||||
| // vpx_variance16x16_media(). |  | ||||||
| // xoffset and yoffset index an 8-entry table and are not range-checked here. |  | ||||||
| unsigned int vpx_sub_pixel_variance16x16_media( |  | ||||||
|     const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, |  | ||||||
|     const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { |  | ||||||
|   uint16_t first_pass[36 * 16]; |  | ||||||
|   uint8_t second_pass[20 * 16]; |  | ||||||
|   const int16_t *HFilter, *VFilter; |  | ||||||
|   unsigned int var; |  | ||||||
|  |  | ||||||
|   // Offset 4 selects the { 64, 64 } table entry. |  | ||||||
|   if (xoffset == 4 && yoffset == 0) { |  | ||||||
|     var = vpx_variance_halfpixvar16x16_h_media( |  | ||||||
|         src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); |  | ||||||
|   } else if (xoffset == 0 && yoffset == 4) { |  | ||||||
|     var = vpx_variance_halfpixvar16x16_v_media( |  | ||||||
|         src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); |  | ||||||
|   } else if (xoffset == 4 && yoffset == 4) { |  | ||||||
|     var = vpx_variance_halfpixvar16x16_hv_media( |  | ||||||
|         src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); |  | ||||||
|   } else { |  | ||||||
|     HFilter = bilinear_filters_media[xoffset]; |  | ||||||
|     VFilter = bilinear_filters_media[yoffset]; |  | ||||||
|  |  | ||||||
|     // The first pass is asked for 17 rows, one more than the block height. |  | ||||||
|     vpx_filter_block2d_bil_first_pass_media( |  | ||||||
|         src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter); |  | ||||||
|     vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, |  | ||||||
|                                              16, VFilter); |  | ||||||
|  |  | ||||||
|     var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line, |  | ||||||
|                                   sse); |  | ||||||
|   } |  | ||||||
|   return var; |  | ||||||
| } |  | ||||||
| #endif  // HAVE_MEDIA |  | ||||||
| @@ -1,182 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vpx_variance_halfpixvar16x16_h_media| |  | ||||||
|  |  | ||||||
|     ARM |  | ||||||
|     REQUIRE8 |  | ||||||
|     PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA ||.text||, CODE, READONLY, ALIGN=2 |  | ||||||
|  |  | ||||||
| ; Variance of a 16x16 block: src_ptr is half-pixel interpolated horizontally, then compared with ref_ptr. |  | ||||||
| ; r0    unsigned char *src_ptr (17 bytes of each row are read);  r1  int source_stride |  | ||||||
| ; r2    unsigned char *ref_ptr;  r3  int recon_stride |  | ||||||
| ; stack unsigned int *sse (receives the sum of squared differences) |  | ||||||
| ; return (r0): sse - ((sum * sum) >> 8), where sum is the signed sum of differences |  | ||||||
| |vpx_variance_halfpixvar16x16_h_media| PROC |  | ||||||
|  |  | ||||||
|     stmfd   sp!, {r4-r12, lr} |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #0] |  | ||||||
|     pld     [r2, r3, lsl #0] |  | ||||||
|  |  | ||||||
|     mov     r8, #0              ; initialize sum = 0 |  | ||||||
|     ldr     r10, c80808080 |  | ||||||
|     mov     r11, #0             ; initialize sse = 0 |  | ||||||
|     mov     r12, #16            ; set loop counter to 16 (=block height) |  | ||||||
|     mov     lr, #0              ; constant zero |  | ||||||
| loop |  | ||||||
|     ; 1st 4 pixels (columns 0-3 of the current row) |  | ||||||
|     ldr     r4, [r0, #0]        ; load 4 src pixels (a) |  | ||||||
|     ldr     r6, [r0, #1]        ; load 4 src pixels with 1 byte offset (b) |  | ||||||
|     ldr     r5, [r2, #0]        ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     ; bilinear interpolation: per byte (a + b + 1) >> 1, computed as (a - ~b) >> 1 and then flipping the top bit of each byte with 0x80808080 |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; per-byte src - ref; sets the GE flags consumed by the next sel |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     sel     r7, r6, lr          ; keep src - ref where src >= ref, zero elsewhere (positive differences) |  | ||||||
|     usub8   r6, r5, r4          ; per-byte ref - src; sets the GE flags consumed by the next sel |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|     sel     r6, r6, lr          ; keep ref - src where ref >= src, zero elsewhere (negative differences) |  | ||||||
|  |  | ||||||
|     ; calculate partial sums (usad8 against zero adds up the four bytes) |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|     ; calculate total sum (the flags set by adds/subs are not consumed) |  | ||||||
|     adds    r8, r8, r4          ; add positive differences to sum |  | ||||||
|     subs    r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (another two pixels) to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; sse += squares of the two halfwords in r5 (1) |  | ||||||
|  |  | ||||||
|     ; 2nd 4 pixels (same steps as the 1st group, columns 4-7) |  | ||||||
|     ldr     r4, [r0, #4]        ; load 4 src pixels |  | ||||||
|     ldr     r6, [r0, #5]        ; load 4 src pixels with 1 byte offset |  | ||||||
|     ldr     r5, [r2, #4]        ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     ; bilinear interpolation |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     smlad   r11, r7, r7, r11    ; sse += squares of the two halfwords in r7, from the previous group (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 3rd 4 pixels (columns 8-11) |  | ||||||
|     ldr     r4, [r0, #8]        ; load 4 src pixels |  | ||||||
|     ldr     r6, [r0, #9]        ; load 4 src pixels with 1 byte offset |  | ||||||
|     ldr     r5, [r2, #8]        ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     ; bilinear interpolation |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     smlad   r11, r7, r7, r11  ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 4th 4 pixels (columns 12-15; the offset load reads through byte 16 of the row) |  | ||||||
|     ldr     r4, [r0, #12]       ; load 4 src pixels |  | ||||||
|     ldr     r6, [r0, #13]       ; load 4 src pixels with 1 byte offset |  | ||||||
|     ldr     r5, [r2, #12]       ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     ; bilinear interpolation |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     add     r0, r0, r1          ; set src_ptr to next row |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     add     r2, r2, r3          ; set ref_ptr to next row |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|     smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     bne     loop |  | ||||||
|  |  | ||||||
|     ; store sse through the stack argument and return the variance in r0 |  | ||||||
|     ldr     r6, [sp, #40]       ; get address of sse (first stack argument, above the 10 registers pushed on entry) |  | ||||||
|     mul     r0, r8, r8          ; sum * sum |  | ||||||
|     str     r11, [r6]           ; store sse |  | ||||||
|     sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) |  | ||||||
|  |  | ||||||
|     ldmfd   sp!, {r4-r12, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| c80808080 |  | ||||||
|     DCD     0x80808080 |  | ||||||
|  |  | ||||||
|     END |  | ||||||
|  |  | ||||||
| @@ -1,222 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vpx_variance_halfpixvar16x16_hv_media| |  | ||||||
|  |  | ||||||
|     ARM |  | ||||||
|     REQUIRE8 |  | ||||||
|     PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA ||.text||, CODE, READONLY, ALIGN=2 |  | ||||||
|  |  | ||||||
| ; Variance of a 16x16 block: src_ptr is half-pixel interpolated horizontally and vertically, then compared with ref_ptr. |  | ||||||
| ; r0    unsigned char *src_ptr (17 bytes of each of 17 rows are read);  r1  int source_stride |  | ||||||
| ; r2    unsigned char *ref_ptr;  r3  int recon_stride |  | ||||||
| ; stack unsigned int *sse (receives the sum of squared differences) |  | ||||||
| ; return (r0): sse - ((sum * sum) >> 8), where sum is the signed sum of differences |  | ||||||
| |vpx_variance_halfpixvar16x16_hv_media| PROC |  | ||||||
|  |  | ||||||
|     stmfd   sp!, {r4-r12, lr} |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #0] |  | ||||||
|     pld     [r2, r3, lsl #0] |  | ||||||
|  |  | ||||||
|     mov     r8, #0              ; initialize sum = 0 |  | ||||||
|     ldr     r10, c80808080 |  | ||||||
|     mov     r11, #0             ; initialize sse = 0 |  | ||||||
|     mov     r12, #16            ; set loop counter to 16 (=block height) |  | ||||||
|     mov     lr, #0              ; constant zero |  | ||||||
| loop |  | ||||||
|     add     r9, r0, r1          ; pointer to pixels on the next row |  | ||||||
|     ; 1st 4 pixels (columns 0-3 of the current row) |  | ||||||
|     ldr     r4, [r0, #0]        ; load source pixels a, row N |  | ||||||
|     ldr     r6, [r0, #1]        ; load source pixels b, row N |  | ||||||
|     ldr     r5, [r9, #0]        ; load source pixels c, row N+1 |  | ||||||
|     ldr     r7, [r9, #1]        ; load source pixels d, row N+1 |  | ||||||
|  |  | ||||||
|     ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N; done per byte as (a - ~b) >> 1, then the top bit of each byte is flipped with 0x80808080 |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|     ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 |  | ||||||
|     mvn     r7, r7 |  | ||||||
|     uhsub8  r5, r5, r7 |  | ||||||
|     eor     r5, r5, r10 |  | ||||||
|     ; z = (x + y + 1) >> 1, interpolate half pixel values vertically (same trick applied to the two rounded row results) |  | ||||||
|     mvn     r5, r5 |  | ||||||
|     uhsub8  r4, r4, r5 |  | ||||||
|     ldr     r5, [r2, #0]        ; load 4 ref pixels |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; per-byte src - ref; sets the GE flags consumed by the next sel |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     sel     r7, r6, lr          ; keep src - ref where src >= ref, zero elsewhere (positive differences) |  | ||||||
|     usub8   r6, r5, r4          ; per-byte ref - src; sets the GE flags consumed by the next sel |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|     sel     r6, r6, lr          ; keep ref - src where ref >= src, zero elsewhere (negative differences) |  | ||||||
|  |  | ||||||
|     ; calculate partial sums (usad8 against zero adds up the four bytes) |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|     ; calculate total sum (the flags set by adds/subs are not consumed) |  | ||||||
|     adds    r8, r8, r4          ; add positive differences to sum |  | ||||||
|     subs    r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (another two pixels) to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; sse += squares of the two halfwords in r5 (1) |  | ||||||
|  |  | ||||||
|     ; 2nd 4 pixels (same steps as the 1st group, columns 4-7) |  | ||||||
|     ldr     r4, [r0, #4]        ; load source pixels a, row N |  | ||||||
|     ldr     r6, [r0, #5]        ; load source pixels b, row N |  | ||||||
|     ldr     r5, [r9, #4]        ; load source pixels c, row N+1 |  | ||||||
|  |  | ||||||
|     smlad   r11, r7, r7, r11    ; sse += squares of the two halfwords in r7, from the previous group (2) |  | ||||||
|  |  | ||||||
|     ldr     r7, [r9, #5]        ; load source pixels d, row N+1 |  | ||||||
|  |  | ||||||
|     ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|     ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 |  | ||||||
|     mvn     r7, r7 |  | ||||||
|     uhsub8  r5, r5, r7 |  | ||||||
|     eor     r5, r5, r10 |  | ||||||
|     ; z = (x + y + 1) >> 1, interpolate half pixel values vertically |  | ||||||
|     mvn     r5, r5 |  | ||||||
|     uhsub8  r4, r4, r5 |  | ||||||
|     ldr     r5, [r2, #4]        ; load 4 ref pixels |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 3rd 4 pixels (columns 8-11) |  | ||||||
|     ldr     r4, [r0, #8]        ; load source pixels a, row N |  | ||||||
|     ldr     r6, [r0, #9]        ; load source pixels b, row N |  | ||||||
|     ldr     r5, [r9, #8]        ; load source pixels c, row N+1 |  | ||||||
|  |  | ||||||
|     smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     ldr     r7, [r9, #9]        ; load source pixels d, row N+1 |  | ||||||
|  |  | ||||||
|     ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|     ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 |  | ||||||
|     mvn     r7, r7 |  | ||||||
|     uhsub8  r5, r5, r7 |  | ||||||
|     eor     r5, r5, r10 |  | ||||||
|     ; z = (x + y + 1) >> 1, interpolate half pixel values vertically |  | ||||||
|     mvn     r5, r5 |  | ||||||
|     uhsub8  r4, r4, r5 |  | ||||||
|     ldr     r5, [r2, #8]        ; load 4 ref pixels |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 4th 4 pixels (columns 12-15; the offset loads read through byte 16 of each row) |  | ||||||
|     ldr     r4, [r0, #12]       ; load source pixels a, row N |  | ||||||
|     ldr     r6, [r0, #13]       ; load source pixels b, row N |  | ||||||
|     ldr     r5, [r9, #12]       ; load source pixels c, row N+1 |  | ||||||
|     smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2) |  | ||||||
|     ldr     r7, [r9, #13]       ; load source pixels d, row N+1 |  | ||||||
|  |  | ||||||
|     ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|     ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 |  | ||||||
|     mvn     r7, r7 |  | ||||||
|     uhsub8  r5, r5, r7 |  | ||||||
|     eor     r5, r5, r10 |  | ||||||
|     ; z = (x + y + 1) >> 1, interpolate half pixel values vertically |  | ||||||
|     mvn     r5, r5 |  | ||||||
|     uhsub8  r4, r4, r5 |  | ||||||
|     ldr     r5, [r2, #12]       ; load 4 ref pixels |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     add     r0, r0, r1          ; set src_ptr to next row |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     add     r2, r2, r3          ; set ref_ptr to next row |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse (the loop counter is decremented between the two accumulates; its Z flag feeds bne) |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|     smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     bne     loop |  | ||||||
|  |  | ||||||
|     ; store sse through the stack argument and return the variance in r0 |  | ||||||
|     ldr     r6, [sp, #40]       ; get address of sse (first stack argument, above the 10 registers pushed on entry) |  | ||||||
|     mul     r0, r8, r8          ; sum * sum |  | ||||||
|     str     r11, [r6]           ; store sse |  | ||||||
|     sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) |  | ||||||
|  |  | ||||||
|     ldmfd   sp!, {r4-r12, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| c80808080 |  | ||||||
|     DCD     0x80808080 |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -1,184 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vpx_variance_halfpixvar16x16_v_media| |  | ||||||
|  |  | ||||||
|     ARM |  | ||||||
|     REQUIRE8 |  | ||||||
|     PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA ||.text||, CODE, READONLY, ALIGN=2 |  | ||||||
|  |  | ||||||
| ; Variance of a 16x16 block: src_ptr is half-pixel interpolated vertically, then compared with ref_ptr. |  | ||||||
| ; r0    unsigned char *src_ptr (16 bytes of each of 17 rows are read);  r1  int source_stride |  | ||||||
| ; r2    unsigned char *ref_ptr;  r3  int recon_stride |  | ||||||
| ; stack unsigned int *sse (receives the sum of squared differences) |  | ||||||
| ; return (r0): sse - ((sum * sum) >> 8), where sum is the signed sum of differences |  | ||||||
| |vpx_variance_halfpixvar16x16_v_media| PROC |  | ||||||
|  |  | ||||||
|     stmfd   sp!, {r4-r12, lr} |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #0] |  | ||||||
|     pld     [r2, r3, lsl #0] |  | ||||||
|  |  | ||||||
|     mov     r8, #0              ; initialize sum = 0 |  | ||||||
|     ldr     r10, c80808080 |  | ||||||
|     mov     r11, #0             ; initialize sse = 0 |  | ||||||
|     mov     r12, #16            ; set loop counter to 16 (=block height) |  | ||||||
|     mov     lr, #0              ; constant zero |  | ||||||
| loop |  | ||||||
|     add     r9, r0, r1          ; r9 = pointer to the source row below the current one |  | ||||||
|     ; 1st 4 pixels (columns 0-3 of the current row) |  | ||||||
|     ldr     r4, [r0, #0]        ; load 4 src pixels (a) |  | ||||||
|     ldr     r6, [r9, #0]        ; load 4 src pixels from next row (b) |  | ||||||
|     ldr     r5, [r2, #0]        ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     ; bilinear interpolation: per byte (a + b + 1) >> 1, computed as (a - ~b) >> 1 and then flipping the top bit of each byte with 0x80808080 |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; per-byte src - ref; sets the GE flags consumed by the next sel |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     sel     r7, r6, lr          ; keep src - ref where src >= ref, zero elsewhere (positive differences) |  | ||||||
|     usub8   r6, r5, r4          ; per-byte ref - src; sets the GE flags consumed by the next sel |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|     sel     r6, r6, lr          ; keep ref - src where ref >= src, zero elsewhere (negative differences) |  | ||||||
|  |  | ||||||
|     ; calculate partial sums (usad8 against zero adds up the four bytes) |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|     ; calculate total sum (the flags set by adds/subs are not consumed) |  | ||||||
|     adds    r8, r8, r4          ; add positive differences to sum |  | ||||||
|     subs    r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; bytes 0 and 2 (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; bytes 1 and 3 (another two pixels) to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; sse += squares of the two halfwords in r5 (1) |  | ||||||
|  |  | ||||||
|     ; 2nd 4 pixels (same steps as the 1st group, columns 4-7) |  | ||||||
|     ldr     r4, [r0, #4]        ; load 4 src pixels |  | ||||||
|     ldr     r6, [r9, #4]        ; load 4 src pixels from next row |  | ||||||
|     ldr     r5, [r2, #4]        ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     ; bilinear interpolation |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     smlad   r11, r7, r7, r11    ; sse += squares of the two halfwords in r7, from the previous group (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 3rd 4 pixels (columns 8-11) |  | ||||||
|     ldr     r4, [r0, #8]        ; load 4 src pixels |  | ||||||
|     ldr     r6, [r9, #8]        ; load 4 src pixels from next row |  | ||||||
|     ldr     r5, [r2, #8]        ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     ; bilinear interpolation |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 4th 4 pixels (columns 12-15) |  | ||||||
|     ldr     r4, [r0, #12]       ; load 4 src pixels |  | ||||||
|     ldr     r6, [r9, #12]       ; load 4 src pixels from next row |  | ||||||
|     ldr     r5, [r2, #12]       ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     ; bilinear interpolation |  | ||||||
|     mvn     r6, r6 |  | ||||||
|     uhsub8  r4, r4, r6 |  | ||||||
|     eor     r4, r4, r10 |  | ||||||
|  |  | ||||||
|     smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     add     r0, r0, r1          ; set src_ptr to next row |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r6, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     add     r2, r2, r3          ; set ref_ptr to next row |  | ||||||
|     sel     r6, r6, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; absolute differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r6, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|     smlad   r11, r7, r7, r11    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     bne     loop |  | ||||||
|  |  | ||||||
|     ; store sse through the stack argument and return the variance in r0 |  | ||||||
|     ldr     r6, [sp, #40]       ; get address of sse (first stack argument, above the 10 registers pushed on entry) |  | ||||||
|     mul     r0, r8, r8          ; sum * sum |  | ||||||
|     str     r11, [r6]           ; store sse |  | ||||||
|     sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) |  | ||||||
|  |  | ||||||
|     ldmfd   sp!, {r4-r12, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| c80808080 |  | ||||||
|     DCD     0x80808080 |  | ||||||
|  |  | ||||||
|     END |  | ||||||
|  |  | ||||||
| @@ -1,358 +0,0 @@ | |||||||
| ; |  | ||||||
| ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved. |  | ||||||
| ; |  | ||||||
| ;  Use of this source code is governed by a BSD-style license |  | ||||||
| ;  that can be found in the LICENSE file in the root of the source |  | ||||||
| ;  tree. An additional intellectual property rights grant can be found |  | ||||||
| ;  in the file PATENTS.  All contributing project authors may |  | ||||||
| ;  be found in the AUTHORS file in the root of the source tree. |  | ||||||
| ; |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     EXPORT  |vpx_variance16x16_media| |  | ||||||
|     EXPORT  |vpx_variance8x8_media| |  | ||||||
|     EXPORT  |vpx_mse16x16_media| |  | ||||||
|  |  | ||||||
|     ARM |  | ||||||
|     REQUIRE8 |  | ||||||
|     PRESERVE8 |  | ||||||
|  |  | ||||||
|     AREA ||.text||, CODE, READONLY, ALIGN=2 |  | ||||||
|  |  | ||||||
| ; r0    unsigned char *src_ptr |  | ||||||
| ; r1    int source_stride |  | ||||||
| ; r2    unsigned char *ref_ptr |  | ||||||
| ; r3    int  recon_stride |  | ||||||
| ; stack unsigned int *sse |  | ||||||
| |vpx_variance16x16_media| PROC |  | ||||||
|  |  | ||||||
|     stmfd   sp!, {r4-r12, lr} |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #0] |  | ||||||
|     pld     [r2, r3, lsl #0] |  | ||||||
|  |  | ||||||
|     mov     r8, #0              ; initialize sum = 0 |  | ||||||
|     mov     r11, #0             ; initialize sse = 0 |  | ||||||
|     mov     r12, #16            ; set loop counter to 16 (=block height) |  | ||||||
|  |  | ||||||
| loop16x16 |  | ||||||
|     ; 1st 4 pixels |  | ||||||
|     ldr     r4, [r0, #0]        ; load 4 src pixels |  | ||||||
|     ldr     r5, [r2, #0]        ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     mov     lr, #0              ; constant zero |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r9, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|     sel     r6, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; differences of all 4 pixels |  | ||||||
|     ; calculate total sum |  | ||||||
|     adds    r8, r8, r4          ; add positive differences to sum |  | ||||||
|     subs    r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r10, r6, ror #8     ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 2nd 4 pixels |  | ||||||
|     ldr     r4, [r0, #4]        ; load 4 src pixels |  | ||||||
|     ldr     r5, [r2, #4]        ; load 4 ref pixels |  | ||||||
|     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r9, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     sel     r6, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r10, r6, ror #8     ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 3rd 4 pixels |  | ||||||
|     ldr     r4, [r0, #8]        ; load 4 src pixels |  | ||||||
|     ldr     r5, [r2, #8]        ; load 4 ref pixels |  | ||||||
|     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r9, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     sel     r6, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r10, r6, ror #8     ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 4th 4 pixels |  | ||||||
|     ldr     r4, [r0, #12]       ; load 4 src pixels |  | ||||||
|     ldr     r5, [r2, #12]       ; load 4 ref pixels |  | ||||||
|     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r6, r4, r5          ; calculate difference |  | ||||||
|     add     r0, r0, r1          ; set src_ptr to next row |  | ||||||
|     sel     r7, r6, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r9, r5, r4          ; calculate difference with reversed operands |  | ||||||
|     add     r2, r2, r3          ; set ref_ptr to next row |  | ||||||
|     sel     r6, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r4, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r5, r6, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r6, r6, r7          ; differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r8, r8, r4          ; add positive differences to sum |  | ||||||
|     sub     r8, r8, r5          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r5, r6              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r10, r6, ror #8     ; another two pixels to halfwords |  | ||||||
|     smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1) |  | ||||||
|     smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1 |  | ||||||
|  |  | ||||||
|     bne     loop16x16 |  | ||||||
|  |  | ||||||
|     ; return stuff |  | ||||||
|     ldr     r6, [sp, #40]       ; get address of sse |  | ||||||
|     mul     r0, r8, r8          ; sum * sum |  | ||||||
|     str     r11, [r6]           ; store sse |  | ||||||
|     sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) |  | ||||||
|  |  | ||||||
|     ldmfd   sp!, {r4-r12, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ; r0    unsigned char *src_ptr |  | ||||||
| ; r1    int source_stride |  | ||||||
| ; r2    unsigned char *ref_ptr |  | ||||||
| ; r3    int  recon_stride |  | ||||||
| ; stack unsigned int *sse |  | ||||||
| |vpx_variance8x8_media| PROC |  | ||||||
|  |  | ||||||
|     push    {r4-r10, lr} |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #0] |  | ||||||
|     pld     [r2, r3, lsl #0] |  | ||||||
|  |  | ||||||
|     mov     r12, #8             ; set loop counter to 8 (=block height) |  | ||||||
|     mov     r4, #0              ; initialize sum = 0 |  | ||||||
|     mov     r5, #0              ; initialize sse = 0 |  | ||||||
|  |  | ||||||
| loop8x8 |  | ||||||
|     ; 1st 4 pixels |  | ||||||
|     ldr     r6, [r0, #0x0]      ; load 4 src pixels |  | ||||||
|     ldr     r7, [r2, #0x0]      ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     mov     lr, #0              ; constant zero |  | ||||||
|  |  | ||||||
|     usub8   r8, r6, r7          ; calculate difference |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     sel     r10, r8, lr         ; select bytes with positive difference |  | ||||||
|     usub8   r9, r7, r6          ; calculate difference with reversed operands |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|     sel     r8, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r6, r10, lr         ; calculate sum of positive differences |  | ||||||
|     usad8   r7, r8, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r8, r8, r10         ; differences of all 4 pixels |  | ||||||
|     ; calculate total sum |  | ||||||
|     add    r4, r4, r6           ; add positive differences to sum |  | ||||||
|     sub    r4, r4, r7           ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r7, r8              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r10, r8, ror #8     ; another two pixels to halfwords |  | ||||||
|     smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 2nd 4 pixels |  | ||||||
|     ldr     r6, [r0, #0x4]      ; load 4 src pixels |  | ||||||
|     ldr     r7, [r2, #0x4]      ; load 4 ref pixels |  | ||||||
|     smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r8, r6, r7          ; calculate difference |  | ||||||
|     add     r0, r0, r1          ; set src_ptr to next row |  | ||||||
|     sel     r10, r8, lr         ; select bytes with positive difference |  | ||||||
|     usub8   r9, r7, r6          ; calculate difference with reversed operands |  | ||||||
|     add     r2, r2, r3          ; set ref_ptr to next row |  | ||||||
|     sel     r8, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r6, r10, lr         ; calculate sum of positive differences |  | ||||||
|     usad8   r7, r8, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r8, r8, r10         ; differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ; calculate total sum |  | ||||||
|     add     r4, r4, r6          ; add positive differences to sum |  | ||||||
|     sub     r4, r4, r7          ; subtract negative differences from sum |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r7, r8              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r10, r8, ror #8     ; another two pixels to halfwords |  | ||||||
|     smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1) |  | ||||||
|     subs    r12, r12, #1        ; next row |  | ||||||
|     smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     bne     loop8x8 |  | ||||||
|  |  | ||||||
|     ; return stuff |  | ||||||
|     ldr     r8, [sp, #32]       ; get address of sse |  | ||||||
|     mul     r1, r4, r4          ; sum * sum |  | ||||||
|     str     r5, [r8]            ; store sse |  | ||||||
|     sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6)) |  | ||||||
|  |  | ||||||
|     pop     {r4-r10, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
| ; r0    unsigned char *src_ptr |  | ||||||
| ; r1    int source_stride |  | ||||||
| ; r2    unsigned char *ref_ptr |  | ||||||
| ; r3    int  recon_stride |  | ||||||
| ; stack unsigned int *sse |  | ||||||
| ; |  | ||||||
| ; Note: based on vpx_variance16x16_media. MSE does not need the sum of |  | ||||||
| ;       differences, so the sum accumulation is omitted here; the usad8 |  | ||||||
| ;       partial sums that remain in the loop are computed but never used. |  | ||||||
|  |  | ||||||
| |vpx_mse16x16_media| PROC |  | ||||||
|  |  | ||||||
|     push    {r4-r9, lr} |  | ||||||
|  |  | ||||||
|     pld     [r0, r1, lsl #0] |  | ||||||
|     pld     [r2, r3, lsl #0] |  | ||||||
|  |  | ||||||
|     mov     r12, #16            ; set loop counter to 16 (=block height) |  | ||||||
|     mov     r4, #0              ; initialize sse = 0 |  | ||||||
|  |  | ||||||
| loopmse |  | ||||||
|     ; 1st 4 pixels |  | ||||||
|     ldr     r5, [r0, #0x0]      ; load 4 src pixels |  | ||||||
|     ldr     r6, [r2, #0x0]      ; load 4 ref pixels |  | ||||||
|  |  | ||||||
|     mov     lr, #0              ; constant zero |  | ||||||
|  |  | ||||||
|     usub8   r8, r5, r6          ; calculate difference |  | ||||||
|     pld     [r0, r1, lsl #1] |  | ||||||
|     sel     r7, r8, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r9, r6, r5          ; calculate difference with reversed operands |  | ||||||
|     pld     [r2, r3, lsl #1] |  | ||||||
|     sel     r8, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r5, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r6, r8, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r8, r8, r7          ; differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ldr     r5, [r0, #0x4]      ; load 4 src pixels |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r6, r8              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r8, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 2nd 4 pixels |  | ||||||
|     ldr     r6, [r2, #0x4]      ; load 4 ref pixels |  | ||||||
|     smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r8, r5, r6          ; calculate difference |  | ||||||
|     sel     r7, r8, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r9, r6, r5          ; calculate difference with reversed operands |  | ||||||
|     sel     r8, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r5, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r6, r8, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r8, r8, r7          ; differences of all 4 pixels |  | ||||||
|     ldr     r5, [r0, #0x8]      ; load 4 src pixels |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r6, r8              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r8, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 3rd 4 pixels |  | ||||||
|     ldr     r6, [r2, #0x8]      ; load 4 ref pixels |  | ||||||
|     smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r8, r5, r6          ; calculate difference |  | ||||||
|     sel     r7, r8, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r9, r6, r5          ; calculate difference with reversed operands |  | ||||||
|     sel     r8, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r5, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r6, r8, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r8, r8, r7          ; differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     ldr     r5, [r0, #0xc]      ; load 4 src pixels |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r6, r8              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r8, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1) |  | ||||||
|  |  | ||||||
|     ; 4th 4 pixels |  | ||||||
|     ldr     r6, [r2, #0xc]      ; load 4 ref pixels |  | ||||||
|     smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     usub8   r8, r5, r6          ; calculate difference |  | ||||||
|     add     r0, r0, r1          ; set src_ptr to next row |  | ||||||
|     sel     r7, r8, lr          ; select bytes with positive difference |  | ||||||
|     usub8   r9, r6, r5          ; calculate difference with reversed operands |  | ||||||
|     add     r2, r2, r3          ; set ref_ptr to next row |  | ||||||
|     sel     r8, r9, lr          ; select bytes with negative difference |  | ||||||
|  |  | ||||||
|     ; calculate partial sums |  | ||||||
|     usad8   r5, r7, lr          ; calculate sum of positive differences |  | ||||||
|     usad8   r6, r8, lr          ; calculate sum of negative differences |  | ||||||
|     orr     r8, r8, r7          ; differences of all 4 pixels |  | ||||||
|  |  | ||||||
|     subs    r12, r12, #1        ; next row |  | ||||||
|  |  | ||||||
|     ; calculate sse |  | ||||||
|     uxtb16  r6, r8              ; byte (two pixels) to halfwords |  | ||||||
|     uxtb16  r7, r8, ror #8      ; another two pixels to halfwords |  | ||||||
|     smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1) |  | ||||||
|     smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2) |  | ||||||
|  |  | ||||||
|     bne     loopmse |  | ||||||
|  |  | ||||||
|     ; return stuff |  | ||||||
|     ldr     r1, [sp, #28]       ; get address of sse |  | ||||||
|     mov     r0, r4              ; return sse |  | ||||||
|     str     r4, [r1]            ; store sse |  | ||||||
|  |  | ||||||
|     pop     {r4-r9, pc} |  | ||||||
|  |  | ||||||
|     ENDP |  | ||||||
|  |  | ||||||
|     END |  | ||||||
| @@ -271,7 +271,6 @@ DSP_SRCS-yes            += subtract.c | |||||||
| DSP_SRCS-yes            += sum_squares.c | DSP_SRCS-yes            += sum_squares.c | ||||||
| DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c | DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c | ||||||
|  |  | ||||||
| DSP_SRCS-$(HAVE_MEDIA)  += arm/sad_media$(ASM) |  | ||||||
| DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c | DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c | ||||||
| DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c | DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c | ||||||
| DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c | DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c | ||||||
| @@ -302,12 +301,6 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC) | |||||||
| DSP_SRCS-yes            += variance.c | DSP_SRCS-yes            += variance.c | ||||||
| DSP_SRCS-yes            += variance.h | DSP_SRCS-yes            += variance.h | ||||||
|  |  | ||||||
| DSP_SRCS-$(HAVE_MEDIA)  += arm/bilinear_filter_media$(ASM) |  | ||||||
| DSP_SRCS-$(HAVE_MEDIA)  += arm/subpel_variance_media.c |  | ||||||
| DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_h_media$(ASM) |  | ||||||
| DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_hv_media$(ASM) |  | ||||||
| DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_v_media$(ASM) |  | ||||||
| DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_media$(ASM) |  | ||||||
| DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c | DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c | ||||||
| DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c | DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c | ||||||
|  |  | ||||||
|   | |||||||
| @@ -960,7 +960,7 @@ add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride | |||||||
| specialize qw/vpx_sad16x32 msa sse2/; | specialize qw/vpx_sad16x32 msa sse2/; | ||||||
|  |  | ||||||
| add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; | add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; | ||||||
| specialize qw/vpx_sad16x16 media neon msa sse2/; | specialize qw/vpx_sad16x16 neon msa sse2/; | ||||||
|  |  | ||||||
| add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; | add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; | ||||||
| specialize qw/vpx_sad16x8 neon msa sse2/; | specialize qw/vpx_sad16x8 neon msa sse2/; | ||||||
| @@ -1387,7 +1387,7 @@ add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int sourc | |||||||
|   specialize qw/vpx_variance16x32 sse2 msa/; |   specialize qw/vpx_variance16x32 sse2 msa/; | ||||||
|  |  | ||||||
| add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | ||||||
|   specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/; |   specialize qw/vpx_variance16x16 sse2 avx2 neon msa/; | ||||||
|  |  | ||||||
| add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | ||||||
|   specialize qw/vpx_variance16x8 sse2 neon msa/; |   specialize qw/vpx_variance16x8 sse2 neon msa/; | ||||||
| @@ -1396,7 +1396,7 @@ add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source | |||||||
|   specialize qw/vpx_variance8x16 sse2 neon msa/; |   specialize qw/vpx_variance8x16 sse2 neon msa/; | ||||||
|  |  | ||||||
| add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | ||||||
|   specialize qw/vpx_variance8x8 sse2 media neon msa/; |   specialize qw/vpx_variance8x8 sse2 neon msa/; | ||||||
|  |  | ||||||
| add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | ||||||
|   specialize qw/vpx_variance8x4 sse2 msa/; |   specialize qw/vpx_variance8x4 sse2 msa/; | ||||||
| @@ -1417,7 +1417,7 @@ add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, co | |||||||
|   specialize qw/vpx_get8x8var sse2 neon msa/; |   specialize qw/vpx_get8x8var sse2 neon msa/; | ||||||
|  |  | ||||||
| add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"; | add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"; | ||||||
|   specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/; |   specialize qw/vpx_mse16x16 sse2 avx2 neon msa/; | ||||||
|  |  | ||||||
| add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"; | add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse"; | ||||||
|   specialize qw/vpx_mse16x8 sse2 msa/; |   specialize qw/vpx_mse16x8 sse2 msa/; | ||||||
| @@ -1458,7 +1458,7 @@ add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int | |||||||
|   specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/; |   specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/; | ||||||
|  |  | ||||||
| add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; | add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; | ||||||
|   specialize qw/vpx_sub_pixel_variance16x16 media neon msa sse2 ssse3/; |   specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/; | ||||||
|  |  | ||||||
| add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; | add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; | ||||||
|   specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/; |   specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/; | ||||||
| @@ -1467,7 +1467,7 @@ add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int | |||||||
|   specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/; |   specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/; | ||||||
|  |  | ||||||
| add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; | add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; | ||||||
|   specialize qw/vpx_sub_pixel_variance8x8 media neon msa sse2 ssse3/; |   specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/; | ||||||
|  |  | ||||||
| add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; | add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; | ||||||
|   specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/; |   specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/; | ||||||
| @@ -1520,14 +1520,19 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i | |||||||
| # | # | ||||||
| # Specialty Subpixel | # Specialty Subpixel | ||||||
| # | # | ||||||
|  | # TODO(johannkoenig): Add neon implementations of | ||||||
|  | #  vpx_variance_halfpixvar16x16_h | ||||||
|  | #  vpx_variance_halfpixvar16x16_v | ||||||
|  | #  vpx_variance_halfpixvar16x16_hv | ||||||
|  | # https://bugs.chromium.org/p/webm/issues/detail?id=1273 | ||||||
| add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse"; | add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse"; | ||||||
|   specialize qw/vpx_variance_halfpixvar16x16_h sse2 media/; |   specialize qw/vpx_variance_halfpixvar16x16_h sse2/; | ||||||
|  |  | ||||||
| add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse"; | add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse"; | ||||||
|   specialize qw/vpx_variance_halfpixvar16x16_v sse2 media/; |   specialize qw/vpx_variance_halfpixvar16x16_v sse2/; | ||||||
|  |  | ||||||
| add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse"; | add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse"; | ||||||
|   specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/; |   specialize qw/vpx_variance_halfpixvar16x16_hv sse2/; | ||||||
|  |  | ||||||
| if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { | if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { | ||||||
|   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; |   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; | ||||||
|   | |||||||
| @@ -10,8 +10,9 @@ | |||||||
|  |  | ||||||
| #include <stdlib.h> | #include <stdlib.h> | ||||||
| #include <string.h> | #include <string.h> | ||||||
| #include "vpx_ports/arm.h" |  | ||||||
| #include "./vpx_config.h" | #include "./vpx_config.h" | ||||||
|  | #include "vpx_ports/arm.h" | ||||||
|  |  | ||||||
| #ifdef WINAPI_FAMILY | #ifdef WINAPI_FAMILY | ||||||
| #include <winapifamily.h> | #include <winapifamily.h> | ||||||
| @@ -49,9 +50,6 @@ int arm_cpu_caps(void) { | |||||||
|     return flags; |     return flags; | ||||||
|   } |   } | ||||||
|   mask = arm_cpu_env_mask(); |   mask = arm_cpu_env_mask(); | ||||||
| #if HAVE_MEDIA |  | ||||||
|   flags |= HAS_MEDIA; |  | ||||||
| #endif /* HAVE_MEDIA */ |  | ||||||
| #if HAVE_NEON || HAVE_NEON_ASM | #if HAVE_NEON || HAVE_NEON_ASM | ||||||
|   flags |= HAS_NEON; |   flags |= HAS_NEON; | ||||||
| #endif /* HAVE_NEON  || HAVE_NEON_ASM */ | #endif /* HAVE_NEON  || HAVE_NEON_ASM */ | ||||||
| @@ -75,28 +73,18 @@ int arm_cpu_caps(void) { | |||||||
|  *  instructions via their assembled hex code. |  *  instructions via their assembled hex code. | ||||||
|  * All of these instructions should be essentially nops. |  * All of these instructions should be essentially nops. | ||||||
|  */ |  */ | ||||||
| #if HAVE_MEDIA | #if HAVE_NEON || HAVE_NEON_ASM | ||||||
|   if (mask & HAS_MEDIA) __try { |   if (mask & HAS_NEON) { | ||||||
|       /*SHADD8 r3,r3,r3*/ |     __try { | ||||||
|       __emit(0xE6333F93); |       /*VORR q0,q0,q0*/ | ||||||
|       flags |= HAS_MEDIA; |       __emit(0xF2200150); | ||||||
|  |       flags |= HAS_NEON; | ||||||
|     } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { |     } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { | ||||||
|       /*Ignore exception.*/ |       /*Ignore exception.*/ | ||||||
|     } |     } | ||||||
| } |  | ||||||
| #endif /* HAVE_MEDIA */ |  | ||||||
| #if HAVE_NEON || HAVE_NEON_ASM |  | ||||||
| if (mask & HAS_NEON) { |  | ||||||
|   __try { |  | ||||||
|     /*VORR q0,q0,q0*/ |  | ||||||
|     __emit(0xF2200150); |  | ||||||
|     flags |= HAS_NEON; |  | ||||||
|   } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { |  | ||||||
|     /*Ignore exception.*/ |  | ||||||
|   } |   } | ||||||
| } |  | ||||||
| #endif /* HAVE_NEON || HAVE_NEON_ASM */ | #endif /* HAVE_NEON || HAVE_NEON_ASM */ | ||||||
| return flags & mask; |   return flags & mask; | ||||||
| } | } | ||||||
|  |  | ||||||
| #elif defined(__ANDROID__) /* end _MSC_VER */ | #elif defined(__ANDROID__) /* end _MSC_VER */ | ||||||
| @@ -112,9 +100,6 @@ int arm_cpu_caps(void) { | |||||||
|   mask = arm_cpu_env_mask(); |   mask = arm_cpu_env_mask(); | ||||||
|   features = android_getCpuFeatures(); |   features = android_getCpuFeatures(); | ||||||
|  |  | ||||||
| #if HAVE_MEDIA |  | ||||||
|   flags |= HAS_MEDIA; |  | ||||||
| #endif /* HAVE_MEDIA */ |  | ||||||
| #if HAVE_NEON || HAVE_NEON_ASM | #if HAVE_NEON || HAVE_NEON_ASM | ||||||
|   if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; |   if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; | ||||||
| #endif /* HAVE_NEON || HAVE_NEON_ASM */ | #endif /* HAVE_NEON || HAVE_NEON_ASM */ | ||||||
| @@ -153,15 +138,6 @@ int arm_cpu_caps(void) { | |||||||
|         } |         } | ||||||
|       } |       } | ||||||
| #endif /* HAVE_NEON || HAVE_NEON_ASM */ | #endif /* HAVE_NEON || HAVE_NEON_ASM */ | ||||||
| #if HAVE_MEDIA |  | ||||||
|       if (memcmp(buf, "CPU architecture:", 17) == 0) { |  | ||||||
|         int version; |  | ||||||
|         version = atoi(buf + 17); |  | ||||||
|         if (version >= 6) { |  | ||||||
|           flags |= HAS_MEDIA; |  | ||||||
|         } |  | ||||||
|       } |  | ||||||
| #endif /* HAVE_MEDIA */ |  | ||||||
|     } |     } | ||||||
|     fclose(fin); |     fclose(fin); | ||||||
|   } |   } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Johann
					Johann