Remove armv6 target
Change-Id: I1fa81cc9cabf362a185fc3a53f1e58de533a41e5
This commit is contained in:
parent
476e8fc855
commit
d55724fae9
3
README
3
README
@ -49,9 +49,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
|
||||
|
||||
arm64-darwin-gcc
|
||||
arm64-linux-gcc
|
||||
armv6-linux-rvct
|
||||
armv6-linux-gcc
|
||||
armv6-none-rvct
|
||||
armv7-android-gcc
|
||||
armv7-darwin-gcc
|
||||
armv7-linux-rvct
|
||||
|
@ -29,11 +29,6 @@
|
||||
# include $(CLEAR_VARS)
|
||||
# include jni/libvpx/build/make/Android.mk
|
||||
#
|
||||
# There are currently two TARGET_ARCH_ABI targets for ARM.
|
||||
# armeabi and armeabi-v7a. armeabi-v7a is selected by creating an
|
||||
# Application.mk in the jni directory that contains:
|
||||
# APP_ABI := armeabi-v7a
|
||||
#
|
||||
# By default libvpx will detect at runtime the existence of the NEON extension.
|
||||
# For this we import the 'cpufeatures' module from the NDK sources.
|
||||
# libvpx can also be configured without this runtime detection method.
|
||||
@ -42,9 +37,6 @@
|
||||
# --disable-neon-asm
|
||||
# will remove any NEON dependency.
|
||||
|
||||
# To change to building armeabi, run ./libvpx/configure again, but with
|
||||
# --target=armv6-android-gcc and modify the Application.mk file to
|
||||
# set APP_ABI := armeabi
|
||||
#
|
||||
# Running ndk-build will build libvpx and include it in your project.
|
||||
#
|
||||
@ -59,9 +51,6 @@ ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL)
|
||||
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
|
||||
include $(CONFIG_DIR)libs-armv7-android-gcc.mk
|
||||
LOCAL_ARM_MODE := arm
|
||||
else ifeq ($(TARGET_ARCH_ABI),armeabi)
|
||||
include $(CONFIG_DIR)libs-armv6-android-gcc.mk
|
||||
LOCAL_ARM_MODE := arm
|
||||
else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
|
||||
include $(CONFIG_DIR)libs-armv8-android-gcc.mk
|
||||
LOCAL_ARM_MODE := arm
|
||||
|
@ -680,9 +680,6 @@ process_common_toolchain() {
|
||||
aarch64*)
|
||||
tgt_isa=arm64
|
||||
;;
|
||||
armv6*)
|
||||
tgt_isa=armv6
|
||||
;;
|
||||
armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf)
|
||||
tgt_isa=armv7
|
||||
float_abi=hard
|
||||
@ -883,36 +880,6 @@ process_common_toolchain() {
|
||||
if disabled neon && enabled neon_asm; then
|
||||
die "Disabling neon while keeping neon-asm is not supported"
|
||||
fi
|
||||
case ${toolchain} in
|
||||
# Apple iOS SDKs no longer support armv6 as of the version 9
|
||||
# release (coincides with release of Xcode 7). Only enable media
|
||||
# when using earlier SDK releases.
|
||||
*-darwin*)
|
||||
if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
|
||||
soft_enable media
|
||||
else
|
||||
soft_disable media
|
||||
RTCD_OPTIONS="${RTCD_OPTIONS}--disable-media "
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
soft_enable media
|
||||
;;
|
||||
esac
|
||||
;;
|
||||
armv6)
|
||||
case ${toolchain} in
|
||||
*-darwin*)
|
||||
if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then
|
||||
soft_enable media
|
||||
else
|
||||
die "Your iOS SDK does not support armv6."
|
||||
fi
|
||||
;;
|
||||
*)
|
||||
soft_enable media
|
||||
;;
|
||||
esac
|
||||
;;
|
||||
esac
|
||||
|
||||
|
@ -384,13 +384,8 @@ if ($opts{arch} eq 'x86') {
|
||||
}
|
||||
close CONFIG_FILE;
|
||||
mips;
|
||||
} elsif ($opts{arch} eq 'armv6') {
|
||||
@ALL_ARCHS = filter(qw/media/);
|
||||
arm;
|
||||
} elsif ($opts{arch} =~ /armv7\w?/) {
|
||||
@ALL_ARCHS = filter(qw/media neon_asm neon/);
|
||||
@REQUIRES = filter(keys %required ? keys %required : qw/media/);
|
||||
&require(@REQUIRES);
|
||||
@ALL_ARCHS = filter(qw/neon_asm neon/);
|
||||
arm;
|
||||
} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
|
||||
@ALL_ARCHS = filter(qw/neon/);
|
||||
|
5
configure
vendored
5
configure
vendored
@ -99,9 +99,6 @@ EOF
|
||||
# alphabetically by architecture, generic-gnu last.
|
||||
all_platforms="${all_platforms} arm64-darwin-gcc"
|
||||
all_platforms="${all_platforms} arm64-linux-gcc"
|
||||
all_platforms="${all_platforms} armv6-linux-rvct"
|
||||
all_platforms="${all_platforms} armv6-linux-gcc"
|
||||
all_platforms="${all_platforms} armv6-none-rvct"
|
||||
all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8
|
||||
all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8
|
||||
all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
|
||||
@ -236,8 +233,6 @@ ARCH_EXT_LIST_X86="
|
||||
avx2
|
||||
"
|
||||
ARCH_EXT_LIST="
|
||||
edsp
|
||||
media
|
||||
neon
|
||||
neon_asm
|
||||
|
||||
|
@ -640,13 +640,6 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// ARM functions
|
||||
// Registers the "_media" 16x16 SAD implementation with SADTest.
// The whole block is compiled only when HAVE_MEDIA is set.
#if HAVE_MEDIA
|
||||
// Parameter table: (width, height, function under test).
const SadMxNParam media_tests[] = {
|
||||
SadMxNParam(16, 16, &vpx_sad16x16_media),
|
||||
};
|
||||
INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::ValuesIn(media_tests));
|
||||
#endif // HAVE_MEDIA
|
||||
|
||||
#if HAVE_NEON
|
||||
const SadMxNParam neon_tests[] = {
|
||||
SadMxNParam(64, 64, &vpx_sad64x64_neon),
|
||||
|
@ -1205,22 +1205,6 @@ INSTANTIATE_TEST_CASE_P(
|
||||
make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0)));
|
||||
#endif // HAVE_AVX2
|
||||
|
||||
// Registers the "_media" MSE, variance and sub-pixel variance implementations
// with their respective test fixtures. Compiled only when HAVE_MEDIA is set.
// NOTE(review): the first two numeric arguments look like log2 block
// dimensions (4 -> 16, 3 -> 8, matching the function names) - confirm against
// the parameter type definitions.
#if HAVE_MEDIA
|
||||
INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
|
||||
::testing::Values(MseParams(4, 4,
|
||||
&vpx_mse16x16_media)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
MEDIA, VpxVarianceTest,
|
||||
::testing::Values(VarianceParams(4, 4, &vpx_variance16x16_media),
|
||||
VarianceParams(3, 3, &vpx_variance8x8_media)));
|
||||
|
||||
INSTANTIATE_TEST_CASE_P(
|
||||
MEDIA, VpxSubpelVarianceTest,
|
||||
::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_media, 0),
|
||||
make_tuple(3, 3, &vpx_sub_pixel_variance8x8_media, 0)));
|
||||
#endif // HAVE_MEDIA
|
||||
|
||||
#if HAVE_NEON
|
||||
INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest,
|
||||
::testing::Values(SseParams(2, 2,
|
||||
|
@ -1,237 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_filter_block2d_bil_first_pass_armv6|
|
||||
EXPORT |vp8_filter_block2d_bil_second_pass_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;-------------------------------------
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 unsigned short *dst_ptr,
|
||||
; r2 unsigned int src_pitch,
|
||||
; r3 unsigned int height,
|
||||
; stack unsigned int width,
|
||||
; stack const short *vp8_filter
|
||||
;-------------------------------------
|
||||
; The output is stored transposed in the output array to make second-pass filtering easy.
|
||||
; First pass of the 2-tap bilinear filter.
; For each input row, four output samples are produced per inner iteration:
; each is (src[i]*tap0 + src[i+1]*tap1 + 0x40) >> 7, saturated to 16 bits and
; written as a halfword. Consecutive samples of a row are written r3 bytes
; apart, so the output is transposed.
; If the packed coefficient word equals 128 the filter is skipped and the
; source bytes are simply widened to halfwords (bil_null_1st_filter).
|vp8_filter_block2d_bil_first_pass_armv6| PROC
|
||||
; Nine registers (36 bytes) are pushed, so the two stack arguments are read
; at sp+36 and sp+40 below.
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp8_filter address
|
||||
ldr r4, [sp, #36] ; width
|
||||
|
||||
mov r12, r3 ; outer-loop counter
|
||||
|
||||
add r7, r2, r4 ; preload next row
|
||||
pld [r0, r7]
|
||||
|
||||
sub r2, r2, r4 ; src increment for height loop
|
||||
|
||||
ldr r5, [r11] ; load up filter coefficients
|
||||
|
||||
mov r3, r3, lsl #1 ; height*2
|
||||
add r3, r3, #2 ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
|
||||
|
||||
mov r11, r1 ; save dst_ptr for each row
|
||||
|
||||
cmp r5, #128 ; if filter coef = 128, then skip the filter
|
||||
beq bil_null_1st_filter
|
||||
|
||||
|bil_height_loop_1st_v6|
|
||||
ldrb r6, [r0] ; load source data
|
||||
ldrb r7, [r0, #1]
|
||||
ldrb r8, [r0, #2]
|
||||
mov lr, r4, lsr #2 ; 4-in-parallel loop counter
|
||||
|
||||
|bil_width_loop_1st_v6|
|
||||
ldrb r9, [r0, #3]
|
||||
ldrb r10, [r0, #4]
|
||||
|
||||
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
|
||||
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
|
||||
|
||||
smuad r6, r6, r5 ; apply the filter
|
||||
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
|
||||
smuad r7, r7, r5
|
||||
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
|
||||
|
||||
smuad r8, r8, r5
|
||||
smuad r9, r9, r5
|
||||
|
||||
add r0, r0, #4
|
||||
; The flags set here drive the conditional loads (ldrneb) and the bne at
; the bottom of the loop; nothing in between uses an S-suffixed instruction.
subs lr, lr, #1
|
||||
|
||||
add r6, r6, #0x40 ; round_shift_and_clamp
|
||||
add r7, r7, #0x40
|
||||
usat r6, #16, r6, asr #7
|
||||
usat r7, #16, r7, asr #7
|
||||
|
||||
strh r6, [r1], r3 ; result is transposed and stored
|
||||
|
||||
add r8, r8, #0x40 ; round_shift_and_clamp
|
||||
strh r7, [r1], r3
|
||||
add r9, r9, #0x40
|
||||
usat r8, #16, r8, asr #7
|
||||
usat r9, #16, r9, asr #7
|
||||
|
||||
strh r8, [r1], r3 ; result is transposed and stored
|
||||
|
||||
ldrneb r6, [r0] ; load source data
|
||||
strh r9, [r1], r3
|
||||
|
||||
ldrneb r7, [r0, #1]
|
||||
ldrneb r8, [r0, #2]
|
||||
|
||||
bne bil_width_loop_1st_v6
|
||||
|
||||
add r0, r0, r2 ; move to next input row
|
||||
subs r12, r12, #1
|
||||
|
||||
add r9, r2, r4, lsl #1 ; adding back block width
|
||||
pld [r0, r9] ; preload next row
|
||||
|
||||
add r11, r11, #2 ; move over to next column
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_1st_v6
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
; Pass-through path: no filtering, source bytes are stored as halfwords
; using the same transposed layout as the filtered path.
|bil_null_1st_filter|
|
||||
|bil_height_loop_null_1st|
|
||||
mov lr, r4, lsr #2 ; loop counter
|
||||
|
||||
|bil_width_loop_null_1st|
|
||||
ldrb r6, [r0] ; load data
|
||||
ldrb r7, [r0, #1]
|
||||
ldrb r8, [r0, #2]
|
||||
ldrb r9, [r0, #3]
|
||||
|
||||
strh r6, [r1], r3 ; store it to intermediate buffer
|
||||
add r0, r0, #4
|
||||
strh r7, [r1], r3
|
||||
subs lr, lr, #1
|
||||
strh r8, [r1], r3
|
||||
strh r9, [r1], r3
|
||||
|
||||
bne bil_width_loop_null_1st
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r2 ; move to next input line
|
||||
add r11, r11, #2 ; move over to next column
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_null_1st
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP ; |vp8_filter_block2d_bil_first_pass_armv6|
|
||||
|
||||
|
||||
;---------------------------------
|
||||
; r0 unsigned short *src_ptr,
|
||||
; r1 unsigned char *dst_ptr,
|
||||
; r2 int dst_pitch,
|
||||
; r3 unsigned int height,
|
||||
; stack unsigned int width,
|
||||
; stack const short *vp8_filter
|
||||
;---------------------------------
|
||||
; Second pass of the 2-tap bilinear filter.
; Reads 16-bit samples from src_ptr and applies the packed 2-tap filter with
; the same (+0x40, >> 7) rounding as the first pass. Results are saturated to
; 8 bits and stored as bytes dst_pitch apart, four per inner iteration.
; If the packed coefficient word equals 128 the filter is skipped and the low
; byte of each 16-bit sample is stored directly (bil_null_2nd_filter).
|vp8_filter_block2d_bil_second_pass_armv6| PROC
|
||||
; Nine registers (36 bytes) are pushed, so the two stack arguments are read
; at sp+36 and sp+40 below.
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp8_filter address
|
||||
ldr r4, [sp, #36] ; width
|
||||
|
||||
ldr r5, [r11] ; load up filter coefficients
|
||||
mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
|
||||
mov r11, r1
|
||||
|
||||
cmp r5, #128 ; if filter coef = 128, then skip the filter
|
||||
beq bil_null_2nd_filter
|
||||
|
||||
|bil_height_loop_2nd|
|
||||
ldr r6, [r0] ; load the data
|
||||
ldr r8, [r0, #4]
|
||||
ldrh r10, [r0, #8]
|
||||
mov lr, r3, lsr #2 ; loop counter
|
||||
|
||||
|bil_width_loop_2nd|
|
||||
pkhtb r7, r6, r8 ; src[1] | src[2]
|
||||
pkhtb r9, r8, r10 ; src[3] | src[4]
|
||||
|
||||
smuad r6, r6, r5 ; apply filter
|
||||
smuad r8, r8, r5 ; apply filter
|
||||
|
||||
; The flags set here drive the conditional loads (ldrne/ldrneh) and the bne
; at the bottom of the loop.
subs lr, lr, #1
|
||||
|
||||
; r7 and r9 hold their sample pairs in swapped halfword order (see the pkhtb
; comments above), hence the exchanging multiply.
smuadx r7, r7, r5 ; apply filter
|
||||
smuadx r9, r9, r5 ; apply filter
|
||||
|
||||
add r0, r0, #8
|
||||
|
||||
add r6, r6, #0x40 ; round_shift_and_clamp
|
||||
add r7, r7, #0x40
|
||||
usat r6, #8, r6, asr #7
|
||||
usat r7, #8, r7, asr #7
|
||||
strb r6, [r1], r2 ; the result is transposed back and stored
|
||||
|
||||
add r8, r8, #0x40 ; round_shift_and_clamp
|
||||
strb r7, [r1], r2
|
||||
add r9, r9, #0x40
|
||||
usat r8, #8, r8, asr #7
|
||||
usat r9, #8, r9, asr #7
|
||||
strb r8, [r1], r2 ; the result is transposed back and stored
|
||||
|
||||
ldrne r6, [r0] ; load data
|
||||
strb r9, [r1], r2
|
||||
ldrne r8, [r0, #4]
|
||||
ldrneh r10, [r0, #8]
|
||||
|
||||
bne bil_width_loop_2nd
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, #4 ; update src for next row
|
||||
add r11, r11, #1
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_2nd
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
; Pass-through path: no filtering, the low byte of each 16-bit sample is
; stored to the destination.
|bil_null_2nd_filter|
|
||||
|bil_height_loop_null_2nd|
|
||||
mov lr, r3, lsr #2
|
||||
|
||||
|bil_width_loop_null_2nd|
|
||||
ldr r6, [r0], #4 ; load data
|
||||
subs lr, lr, #1
|
||||
ldr r8, [r0], #4
|
||||
|
||||
strb r6, [r1], r2 ; store data
|
||||
mov r7, r6, lsr #16
|
||||
strb r7, [r1], r2
|
||||
mov r9, r8, lsr #16
|
||||
strb r8, [r1], r2
|
||||
strb r9, [r1], r2
|
||||
|
||||
bne bil_width_loop_null_2nd
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, #4
|
||||
add r11, r11, #1
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_null_2nd
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_filter_block2d_bil_second_pass_armv6|
|
||||
|
||||
END
|
@ -1,186 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_copy_mem16x16_v6|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void vp8_copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
; Copies 16 rows of 16 bytes from src (r0, row step r1) to dst (r2, row step r3).
; The copy width is chosen from the alignment of the SOURCE pointer only:
;   r0 & 15 == 0 -> four-register ldmia/stmia per row (copy_mem16x16_fast)
;   r0 &  7 == 0 -> two two-register ldmia/stmia per row (copy_mem16x16_8)
;   r0 &  3 == 0 -> word loads/stores (copy_mem16x16_4)
;   otherwise    -> byte loads/stores
; The destination pointer's alignment is not tested.
; r4-r7 are saved and restored; r12 is used as the row counter.
|vp8_copy_mem16x16_v6| PROC
|
||||
stmdb sp!, {r4 - r7}
|
||||
;push {r4-r7}
|
||||
|
||||
;preload
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
|
||||
ands r4, r0, #15
|
||||
beq copy_mem16x16_fast
|
||||
|
||||
ands r4, r0, #7
|
||||
beq copy_mem16x16_8
|
||||
|
||||
ands r4, r0, #3
|
||||
beq copy_mem16x16_4
|
||||
|
||||
;copy one byte each time
|
||||
ldrb r4, [r0]
|
||||
ldrb r5, [r0, #1]
|
||||
ldrb r6, [r0, #2]
|
||||
ldrb r7, [r0, #3]
|
||||
|
||||
mov r12, #16
|
||||
|
||||
copy_mem16x16_1_loop
|
||||
strb r4, [r2]
|
||||
strb r5, [r2, #1]
|
||||
strb r6, [r2, #2]
|
||||
strb r7, [r2, #3]
|
||||
|
||||
ldrb r4, [r0, #4]
|
||||
ldrb r5, [r0, #5]
|
||||
ldrb r6, [r0, #6]
|
||||
ldrb r7, [r0, #7]
|
||||
|
||||
; The flags set here are consumed by the ldrneb/bne at the end of the row.
subs r12, r12, #1
|
||||
|
||||
strb r4, [r2, #4]
|
||||
strb r5, [r2, #5]
|
||||
strb r6, [r2, #6]
|
||||
strb r7, [r2, #7]
|
||||
|
||||
ldrb r4, [r0, #8]
|
||||
ldrb r5, [r0, #9]
|
||||
ldrb r6, [r0, #10]
|
||||
ldrb r7, [r0, #11]
|
||||
|
||||
strb r4, [r2, #8]
|
||||
strb r5, [r2, #9]
|
||||
strb r6, [r2, #10]
|
||||
strb r7, [r2, #11]
|
||||
|
||||
ldrb r4, [r0, #12]
|
||||
ldrb r5, [r0, #13]
|
||||
ldrb r6, [r0, #14]
|
||||
ldrb r7, [r0, #15]
|
||||
|
||||
add r0, r0, r1
|
||||
|
||||
strb r4, [r2, #12]
|
||||
strb r5, [r2, #13]
|
||||
strb r6, [r2, #14]
|
||||
strb r7, [r2, #15]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrneb r4, [r0]
|
||||
ldrneb r5, [r0, #1]
|
||||
ldrneb r6, [r0, #2]
|
||||
ldrneb r7, [r0, #3]
|
||||
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
|
||||
bne copy_mem16x16_1_loop
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
;pop {r4-r7}
|
||||
mov pc, lr
|
||||
|
||||
;copy 4 bytes each time
|
||||
copy_mem16x16_4
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
ldr r6, [r0, #8]
|
||||
ldr r7, [r0, #12]
|
||||
|
||||
mov r12, #16
|
||||
|
||||
copy_mem16x16_4_loop
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r1
|
||||
|
||||
str r4, [r2]
|
||||
str r5, [r2, #4]
|
||||
str r6, [r2, #8]
|
||||
str r7, [r2, #12]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrne r4, [r0]
|
||||
ldrne r5, [r0, #4]
|
||||
ldrne r6, [r0, #8]
|
||||
ldrne r7, [r0, #12]
|
||||
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
|
||||
bne copy_mem16x16_4_loop
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
;pop {r4-r7}
|
||||
mov pc, lr
|
||||
|
||||
;copy 8 bytes each time
|
||||
copy_mem16x16_8
|
||||
; The writeback ldmia/stmia pairs below advance r0 and r2 by 16 bytes per
; row, so the row steps are reduced by 16 up front.
sub r1, r1, #16
|
||||
sub r3, r3, #16
|
||||
|
||||
mov r12, #16
|
||||
|
||||
copy_mem16x16_8_loop
|
||||
ldmia r0!, {r4-r5}
|
||||
;ldm r0, {r4-r5}
|
||||
ldmia r0!, {r6-r7}
|
||||
|
||||
add r0, r0, r1
|
||||
|
||||
stmia r2!, {r4-r5}
|
||||
subs r12, r12, #1
|
||||
;stm r2, {r4-r5}
|
||||
stmia r2!, {r6-r7}
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
bne copy_mem16x16_8_loop
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
;pop {r4-r7}
|
||||
mov pc, lr
|
||||
|
||||
;copy 16 bytes each time
|
||||
copy_mem16x16_fast
|
||||
;sub r1, r1, #16
|
||||
;sub r3, r3, #16
|
||||
|
||||
mov r12, #16
|
||||
|
||||
copy_mem16x16_fast_loop
|
||||
ldmia r0, {r4-r7}
|
||||
;ldm r0, {r4-r7}
|
||||
add r0, r0, r1
|
||||
|
||||
subs r12, r12, #1
|
||||
stmia r2, {r4-r7}
|
||||
;stm r2, {r4-r7}
|
||||
add r2, r2, r3
|
||||
|
||||
pld [r0, #31] ; preload for next 16x16 block
|
||||
bne copy_mem16x16_fast_loop
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
;pop {r4-r7}
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp8_copy_mem16x16_v6|
|
||||
|
||||
END
|
@ -1,128 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_copy_mem8x4_v6|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
; Copies 4 rows of 8 bytes from src (r0, row step r1) to dst (r2, row step r3).
; The copy width is chosen from the alignment of the SOURCE pointer only:
;   r0 & 7 == 0 -> two-register ldmia/stmia per row (copy_mem8x4_fast)
;   r0 & 3 == 0 -> word loads/stores (copy_mem8x4_4)
;   otherwise   -> byte loads/stores
; The destination pointer's alignment is not tested.
; r4-r5 are saved and restored; r12 is used as the row counter.
|vp8_copy_mem8x4_v6| PROC
|
||||
;push {r4-r5}
|
||||
stmdb sp!, {r4-r5}
|
||||
|
||||
;preload
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
ands r4, r0, #7
|
||||
beq copy_mem8x4_fast
|
||||
|
||||
ands r4, r0, #3
|
||||
beq copy_mem8x4_4
|
||||
|
||||
;copy 1 byte each time
|
||||
ldrb r4, [r0]
|
||||
ldrb r5, [r0, #1]
|
||||
|
||||
mov r12, #4
|
||||
|
||||
copy_mem8x4_1_loop
|
||||
strb r4, [r2]
|
||||
strb r5, [r2, #1]
|
||||
|
||||
ldrb r4, [r0, #2]
|
||||
ldrb r5, [r0, #3]
|
||||
|
||||
; The flags set here are consumed by the ldrneb/bne at the end of the row.
subs r12, r12, #1
|
||||
|
||||
strb r4, [r2, #2]
|
||||
strb r5, [r2, #3]
|
||||
|
||||
ldrb r4, [r0, #4]
|
||||
ldrb r5, [r0, #5]
|
||||
|
||||
strb r4, [r2, #4]
|
||||
strb r5, [r2, #5]
|
||||
|
||||
ldrb r4, [r0, #6]
|
||||
ldrb r5, [r0, #7]
|
||||
|
||||
add r0, r0, r1
|
||||
|
||||
strb r4, [r2, #6]
|
||||
strb r5, [r2, #7]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrneb r4, [r0]
|
||||
ldrneb r5, [r0, #1]
|
||||
|
||||
bne copy_mem8x4_1_loop
|
||||
|
||||
ldmia sp!, {r4 - r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
;copy 4 bytes each time
|
||||
copy_mem8x4_4
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
|
||||
mov r12, #4
|
||||
|
||||
copy_mem8x4_4_loop
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r1
|
||||
|
||||
str r4, [r2]
|
||||
str r5, [r2, #4]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrne r4, [r0]
|
||||
ldrne r5, [r0, #4]
|
||||
|
||||
bne copy_mem8x4_4_loop
|
||||
|
||||
ldmia sp!, {r4-r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
;copy 8 bytes each time
|
||||
copy_mem8x4_fast
|
||||
;sub r1, r1, #8
|
||||
;sub r3, r3, #8
|
||||
|
||||
mov r12, #4
|
||||
|
||||
copy_mem8x4_fast_loop
|
||||
ldmia r0, {r4-r5}
|
||||
;ldm r0, {r4-r5}
|
||||
add r0, r0, r1
|
||||
|
||||
subs r12, r12, #1
|
||||
stmia r2, {r4-r5}
|
||||
;stm r2, {r4-r5}
|
||||
add r2, r2, r3
|
||||
|
||||
bne copy_mem8x4_fast_loop
|
||||
|
||||
ldmia sp!, {r4-r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp8_copy_mem8x4_v6|
|
||||
|
||||
END
|
@ -1,128 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_copy_mem8x8_v6|
|
||||
; ARM
|
||||
; REQUIRE8
|
||||
; PRESERVE8
|
||||
|
||||
AREA Block, CODE, READONLY ; name this block of code
|
||||
;void vp8_copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride)
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
; Copies 8 rows of 8 bytes from src (r0, row step r1) to dst (r2, row step r3).
; The copy width is chosen from the alignment of the SOURCE pointer only:
;   r0 & 7 == 0 -> two-register ldmia/stmia per row (copy_mem8x8_fast)
;   r0 & 3 == 0 -> word loads/stores (copy_mem8x8_4)
;   otherwise   -> byte loads/stores
; The destination pointer's alignment is not tested.
; r4-r5 are saved and restored; r12 is used as the row counter.
|vp8_copy_mem8x8_v6| PROC
|
||||
;push {r4-r5}
|
||||
stmdb sp!, {r4-r5}
|
||||
|
||||
;preload
|
||||
pld [r0]
|
||||
pld [r0, r1]
|
||||
pld [r0, r1, lsl #1]
|
||||
|
||||
ands r4, r0, #7
|
||||
beq copy_mem8x8_fast
|
||||
|
||||
ands r4, r0, #3
|
||||
beq copy_mem8x8_4
|
||||
|
||||
;copy 1 byte each time
|
||||
ldrb r4, [r0]
|
||||
ldrb r5, [r0, #1]
|
||||
|
||||
mov r12, #8
|
||||
|
||||
copy_mem8x8_1_loop
|
||||
strb r4, [r2]
|
||||
strb r5, [r2, #1]
|
||||
|
||||
ldrb r4, [r0, #2]
|
||||
ldrb r5, [r0, #3]
|
||||
|
||||
; The flags set here are consumed by the ldrneb/bne at the end of the row.
subs r12, r12, #1
|
||||
|
||||
strb r4, [r2, #2]
|
||||
strb r5, [r2, #3]
|
||||
|
||||
ldrb r4, [r0, #4]
|
||||
ldrb r5, [r0, #5]
|
||||
|
||||
strb r4, [r2, #4]
|
||||
strb r5, [r2, #5]
|
||||
|
||||
ldrb r4, [r0, #6]
|
||||
ldrb r5, [r0, #7]
|
||||
|
||||
add r0, r0, r1
|
||||
|
||||
strb r4, [r2, #6]
|
||||
strb r5, [r2, #7]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrneb r4, [r0]
|
||||
ldrneb r5, [r0, #1]
|
||||
|
||||
bne copy_mem8x8_1_loop
|
||||
|
||||
ldmia sp!, {r4 - r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
;copy 4 bytes each time
|
||||
copy_mem8x8_4
|
||||
ldr r4, [r0]
|
||||
ldr r5, [r0, #4]
|
||||
|
||||
mov r12, #8
|
||||
|
||||
copy_mem8x8_4_loop
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r1
|
||||
|
||||
str r4, [r2]
|
||||
str r5, [r2, #4]
|
||||
|
||||
add r2, r2, r3
|
||||
|
||||
ldrne r4, [r0]
|
||||
ldrne r5, [r0, #4]
|
||||
|
||||
bne copy_mem8x8_4_loop
|
||||
|
||||
ldmia sp!, {r4 - r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
;copy 8 bytes each time
|
||||
copy_mem8x8_fast
|
||||
;sub r1, r1, #8
|
||||
;sub r3, r3, #8
|
||||
|
||||
mov r12, #8
|
||||
|
||||
copy_mem8x8_fast_loop
|
||||
ldmia r0, {r4-r5}
|
||||
;ldm r0, {r4-r5}
|
||||
add r0, r0, r1
|
||||
|
||||
subs r12, r12, #1
|
||||
stmia r2, {r4-r5}
|
||||
;stm r2, {r4-r5}
|
||||
add r2, r2, r3
|
||||
|
||||
bne copy_mem8x8_fast_loop
|
||||
|
||||
ldmia sp!, {r4-r5}
|
||||
;pop {r4-r5}
|
||||
mov pc, lr
|
||||
|
||||
ENDP ; |vp8_copy_mem8x8_v6|
|
||||
|
||||
END
|
@ -1,70 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_dc_only_idct_add_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
;void vp8_dc_only_idct_add_v6(short input_dc, unsigned char *pred_ptr,
|
||||
; int pred_stride, unsigned char *dst_ptr,
|
||||
; int dst_stride)
|
||||
; r0 input_dc
|
||||
; r1 pred_ptr
|
||||
; r2 pred_stride
|
||||
; r3 dst_ptr
|
||||
; sp dst_stride
|
||||
|
||||
; Adds the DC term a1 = (input_dc + 4) >> 3 to a 4x4 block of predictor bytes
; and stores the result, saturated to 8 bits, to the destination.
; Each of the four rows is loaded as one 32-bit word from pred_ptr, processed
; as two pairs of 16-bit lanes, and stored as one word to dst_ptr.
|vp8_dc_only_idct_add_v6| PROC
|
||||
; Four registers (16 bytes) are pushed, so the stack argument dst_stride is
; read at sp+16 below.
stmdb sp!, {r4 - r7}
|
||||
|
||||
add r0, r0, #4 ; input_dc += 4
|
||||
ldr r12, c0x0000FFFF
|
||||
ldr r4, [r1], r2
|
||||
and r0, r12, r0, asr #3 ; input_dc >> 3 + mask
|
||||
ldr r6, [r1], r2
|
||||
orr r0, r0, r0, lsl #16 ; a1 | a1
|
||||
|
||||
ldr r12, [sp, #16] ; dst stride
|
||||
|
||||
; Rows 0 and 1: widen predictor bytes 0/2 and 1/3 to 16 bits and add a1.
uxtab16 r5, r0, r4 ; a1+2 | a1+0
|
||||
uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1
|
||||
uxtab16 r7, r0, r6
|
||||
uxtab16 r6, r0, r6, ror #8
|
||||
; Saturate each 16-bit lane to the 0..255 range.
usat16 r5, #8, r5
|
||||
usat16 r4, #8, r4
|
||||
usat16 r7, #8, r7
|
||||
usat16 r6, #8, r6
|
||||
; Re-interleave the even and odd byte lanes into one 4-byte word per row.
orr r5, r5, r4, lsl #8
|
||||
orr r7, r7, r6, lsl #8
|
||||
ldr r4, [r1], r2
|
||||
str r5, [r3], r12
|
||||
ldr r6, [r1]
|
||||
str r7, [r3], r12
|
||||
|
||||
; Rows 2 and 3: same sequence as above.
uxtab16 r5, r0, r4
|
||||
uxtab16 r4, r0, r4, ror #8
|
||||
uxtab16 r7, r0, r6
|
||||
uxtab16 r6, r0, r6, ror #8
|
||||
usat16 r5, #8, r5
|
||||
usat16 r4, #8, r4
|
||||
usat16 r7, #8, r7
|
||||
usat16 r6, #8, r6
|
||||
orr r5, r5, r4, lsl #8
|
||||
orr r7, r7, r6, lsl #8
|
||||
str r5, [r3], r12
|
||||
str r7, [r3]
|
||||
|
||||
ldmia sp!, {r4 - r7}
|
||||
bx lr
|
||||
|
||||
ENDP ; |vp8_dc_only_idct_add_v6|
|
||||
|
||||
; Constant Pool
|
||||
c0x0000FFFF DCD 0x0000FFFF
|
||||
END
|
@ -1,190 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license and patent
|
||||
; grant that can be found in the LICENSE file in the root of the source
|
||||
; tree. All contributing project authors may be found in the AUTHORS
|
||||
; file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_dequant_idct_add_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
;void vp8_dequant_idct_add_v6(short *input, short *dq,
|
||||
; unsigned char *dest, int stride)
|
||||
; r0 = q
|
||||
; r1 = dq
|
||||
; r2 = dst
|
||||
; r3 = stride
|
||||
|
||||
; Dequantizes a block of 16 coefficients in place, runs a two-stage transform
; on them, adds the result to the pixels already at dst, and finally zeroes
; the 32-byte coefficient buffer.
;
; Stages, in order:
;   1. vp8_dequant_add_loop: input[i] *= dq[i] for 16 halfwords, written back
;      to the input buffer.
;   2. vp8_dequant_idct_loop1_v6: first transform stage, two iterations, each
;      working on packed pairs of 16-bit values; results overwrite input.
;   3. vp8_dequant_idct_loop2_v6: second transform stage, two iterations; each
;      adds 4 for rounding, shifts right by 3, adds to two rows of dst
;      pixels, saturates to 8 bits and stores those two rows.
;   4. "memset": stores zero to the eight words at input.
; NOTE(review): the per-register dataflow in stages 2 and 3 is not annotated
; here; the register comments in those loops should be confirmed against the
; C reference before being relied on.
|vp8_dequant_idct_add_v6| PROC
|
||||
stmdb sp!, {r4-r11, lr}
|
||||
|
||||
ldr r4, [r0] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
; Keep stride in a stack slot: r3 is reused for a constant below.
sub sp, sp, #4
|
||||
str r3, [sp]
|
||||
|
||||
mov r12, #4
|
||||
|
||||
; Each iteration dequantizes four coefficients (two packed words).
vp8_dequant_add_loop
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
ldr r4, [r0, #4] ;input
|
||||
ldr r5, [r1], #4 ;dq
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
smulbb r6, r4, r5
|
||||
smultt r7, r4, r5
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
ldrne r4, [r0, #4]
|
||||
ldrne r5, [r1], #4
|
||||
|
||||
strh r6, [r0], #2
|
||||
strh r7, [r0], #2
|
||||
|
||||
bne vp8_dequant_add_loop
|
||||
|
||||
; r0 advanced by 32 bytes in the loop above; rewind to the start of input.
sub r0, r0, #32
|
||||
mov r1, r0
|
||||
|
||||
; short_idct4x4llm_v6_dual
|
||||
ldr r3, cospi8sqrt2minus1
|
||||
ldr r4, sinpi8sqrt2
|
||||
ldr r6, [r0, #8]
|
||||
mov r5, #2
|
||||
vp8_dequant_idct_loop1_v6
|
||||
ldr r12, [r0, #24]
|
||||
ldr r14, [r0, #16]
|
||||
smulwt r9, r3, r6
|
||||
smulwb r7, r3, r6
|
||||
smulwt r10, r4, r6
|
||||
smulwb r8, r4, r6
|
||||
pkhbt r7, r7, r9, lsl #16
|
||||
smulwt r11, r3, r12
|
||||
pkhbt r8, r8, r10, lsl #16
|
||||
uadd16 r6, r6, r7
|
||||
smulwt r7, r4, r12
|
||||
smulwb r9, r3, r12
|
||||
smulwb r10, r4, r12
|
||||
; The flags set here are consumed by the ldrne/bne at the end of the loop.
subs r5, r5, #1
|
||||
pkhbt r9, r9, r11, lsl #16
|
||||
ldr r11, [r0], #4
|
||||
pkhbt r10, r10, r7, lsl #16
|
||||
uadd16 r7, r12, r9
|
||||
usub16 r7, r8, r7
|
||||
uadd16 r6, r6, r10
|
||||
uadd16 r10, r11, r14
|
||||
usub16 r8, r11, r14
|
||||
uadd16 r9, r10, r6
|
||||
usub16 r10, r10, r6
|
||||
uadd16 r6, r8, r7
|
||||
usub16 r7, r8, r7
|
||||
str r6, [r1, #8]
|
||||
ldrne r6, [r0, #8]
|
||||
str r7, [r1, #16]
|
||||
str r10, [r1, #24]
|
||||
str r9, [r1], #4
|
||||
bne vp8_dequant_idct_loop1_v6
|
||||
|
||||
mov r5, #2
|
||||
; r1 advanced by 8 bytes in the loop above; rewind to the start of input.
sub r0, r1, #8
|
||||
vp8_dequant_idct_loop2_v6
|
||||
ldr r6, [r0], #4
|
||||
ldr r7, [r0], #4
|
||||
ldr r8, [r0], #4
|
||||
ldr r9, [r0], #4
|
||||
smulwt r1, r3, r6
|
||||
smulwt r12, r4, r6
|
||||
smulwt lr, r3, r8
|
||||
smulwt r10, r4, r8
|
||||
pkhbt r11, r8, r6, lsl #16
|
||||
pkhbt r1, lr, r1, lsl #16
|
||||
pkhbt r12, r10, r12, lsl #16
|
||||
pkhtb r6, r6, r8, asr #16
|
||||
uadd16 r6, r1, r6
|
||||
pkhbt lr, r9, r7, lsl #16
|
||||
uadd16 r10, r11, lr
|
||||
usub16 lr, r11, lr
|
||||
pkhtb r8, r7, r9, asr #16
|
||||
; The flags set here are consumed by the bne at the end of the loop.
subs r5, r5, #1
|
||||
smulwt r1, r3, r8
|
||||
smulwb r7, r3, r8
|
||||
smulwt r11, r4, r8
|
||||
smulwb r9, r4, r8
|
||||
pkhbt r1, r7, r1, lsl #16
|
||||
uadd16 r8, r1, r8
|
||||
pkhbt r11, r9, r11, lsl #16
|
||||
usub16 r1, r12, r8
|
||||
uadd16 r8, r11, r6
|
||||
ldr r9, c0x00040004
|
||||
ldr r12, [sp] ; get stride from stack
|
||||
uadd16 r6, r10, r8
|
||||
usub16 r7, r10, r8
|
||||
; Add the rounding constant 4 to both 16-bit lanes before the >> 3 below.
uadd16 r7, r7, r9
|
||||
uadd16 r6, r6, r9
|
||||
uadd16 r10, r14, r1
|
||||
usub16 r1, r14, r1
|
||||
uadd16 r10, r10, r9
|
||||
uadd16 r1, r1, r9
|
||||
ldr r11, [r2] ; load input from dst
|
||||
mov r8, r7, asr #3
|
||||
pkhtb r9, r8, r10, asr #19
|
||||
mov r8, r1, asr #3
|
||||
pkhtb r8, r8, r6, asr #19
|
||||
uxtb16 lr, r11, ror #8
|
||||
qadd16 r9, r9, lr
|
||||
uxtb16 lr, r11
|
||||
qadd16 r8, r8, lr
|
||||
usat16 r9, #8, r9
|
||||
usat16 r8, #8, r8
|
||||
orr r9, r8, r9, lsl #8
|
||||
ldr r11, [r2, r12] ; load input from dst
|
||||
mov r7, r7, lsl #16
|
||||
mov r1, r1, lsl #16
|
||||
mov r10, r10, lsl #16
|
||||
mov r6, r6, lsl #16
|
||||
mov r7, r7, asr #3
|
||||
pkhtb r7, r7, r10, asr #19
|
||||
mov r1, r1, asr #3
|
||||
pkhtb r1, r1, r6, asr #19
|
||||
uxtb16 r8, r11, ror #8
|
||||
qadd16 r7, r7, r8
|
||||
uxtb16 r8, r11
|
||||
qadd16 r1, r1, r8
|
||||
usat16 r7, #8, r7
|
||||
usat16 r1, #8, r1
|
||||
orr r1, r1, r7, lsl #8
|
||||
str r9, [r2], r12 ; store output to dst
|
||||
str r1, [r2], r12 ; store output to dst
|
||||
bne vp8_dequant_idct_loop2_v6
|
||||
|
||||
; memset
|
||||
; r0 advanced by 32 bytes over the two loop2 iterations; rewind and release
; the stack slot that held stride.
sub r0, r0, #32
|
||||
add sp, sp, #4
|
||||
|
||||
mov r12, #0
|
||||
str r12, [r0]
|
||||
str r12, [r0, #4]
|
||||
str r12, [r0, #8]
|
||||
str r12, [r0, #12]
|
||||
str r12, [r0, #16]
|
||||
str r12, [r0, #20]
|
||||
str r12, [r0, #24]
|
||||
str r12, [r0, #28]
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_dequant_idct_add_v6|
|
||||
|
||||
; Constant Pool
|
||||
cospi8sqrt2minus1 DCD 0x00004E7B
|
||||
sinpi8sqrt2 DCD 0x00008A8C
|
||||
c0x00040004 DCD 0x00040004
|
||||
|
||||
END
|
@ -1,69 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_dequantize_b_loop_v6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
;-------------------------------
|
||||
;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
|
||||
; r0 short *Q,
|
||||
; r1 short *DQC
|
||||
; r2 short *DQ
|
||||
; Element-wise multiply of 16 halfwords: DQ[i] = Q[i] * DQC[i] (low 16 bits
; of each product are stored).
; The loop runs twice and handles eight elements per iteration. Inputs are
; read as packed 32-bit words (two halfwords each); smulbb multiplies the
; bottom halves and smultt the top halves.
; Loads for the next group are interleaved with the stores of the current one.
|vp8_dequantize_b_loop_v6| PROC
|
||||
stmdb sp!, {r4-r9, lr}
|
||||
|
||||
ldr r3, [r0] ;load Q
|
||||
ldr r4, [r1] ;load DQC
|
||||
ldr r5, [r0, #4]
|
||||
ldr r6, [r1, #4]
|
||||
|
||||
mov r12, #2 ;loop counter
|
||||
|
||||
dequant_loop
|
||||
smulbb r7, r3, r4 ;multiply
|
||||
smultt r8, r3, r4
|
||||
smulbb r9, r5, r6
|
||||
smultt lr, r5, r6
|
||||
|
||||
ldr r3, [r0, #8]
|
||||
ldr r4, [r1, #8]
|
||||
ldr r5, [r0, #12]
|
||||
ldr r6, [r1, #12]
|
||||
|
||||
strh r7, [r2], #2 ;store result
|
||||
smulbb r7, r3, r4 ;multiply
|
||||
strh r8, [r2], #2
|
||||
smultt r8, r3, r4
|
||||
strh r9, [r2], #2
|
||||
smulbb r9, r5, r6
|
||||
strh lr, [r2], #2
|
||||
smultt lr, r5, r6
|
||||
|
||||
; The flags set here are consumed by the ldrne/bne at the end of the loop.
subs r12, r12, #1
|
||||
|
||||
add r0, r0, #16
|
||||
add r1, r1, #16
|
||||
|
||||
ldrne r3, [r0]
|
||||
strh r7, [r2], #2 ;store result
|
||||
ldrne r4, [r1]
|
||||
strh r8, [r2], #2
|
||||
ldrne r5, [r0, #4]
|
||||
strh r9, [r2], #2
|
||||
ldrne r6, [r1, #4]
|
||||
strh lr, [r2], #2
|
||||
|
||||
bne dequant_loop
|
||||
|
||||
ldmia sp!, {r4-r9, pc}
|
||||
ENDP ;|vp8_dequantize_b_loop_v6|
|
||||
|
||||
END
|
@ -1,624 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_filter_block2d_first_pass_armv6|
|
||||
EXPORT |vp8_filter_block2d_first_pass_16x16_armv6|
|
||||
EXPORT |vp8_filter_block2d_first_pass_8x8_armv6|
|
||||
EXPORT |vp8_filter_block2d_second_pass_armv6|
|
||||
EXPORT |vp8_filter4_block2d_second_pass_armv6|
|
||||
EXPORT |vp8_filter_block2d_first_pass_only_armv6|
|
||||
EXPORT |vp8_filter_block2d_second_pass_only_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
;-------------------------------------
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 short *output_ptr
|
||||
; r2 unsigned int src_pixels_per_line
|
||||
; r3 unsigned int output_width
|
||||
; stack unsigned int output_height
|
||||
; stack const short *vp8_filter
|
||||
;-------------------------------------
|
||||
; vp8_filter the input and put in the output array. Apply the 6 tap FIR filter with
|
||||
; the output being a 2 byte value and the input being a 1 byte value.
|
||||
; First pass of the 6-tap filter over consecutive source bytes. Each inner
; iteration reads bytes around r0 and produces two output samples, which are
; rounded (+0x40), shifted right by 7, saturated to 8 bits and stored as
; halfwords with a stride of (2 * output_width + 16) bytes.
; r7 is a combined counter: remaining height in the top halfword, remaining
; inner iterations in the low byte.
|vp8_filter_block2d_first_pass_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp8_filter address
|
||||
ldr r7, [sp, #36] ; output height
|
||||
|
||||
sub r2, r2, r3 ; inside loop increments input array,
|
||||
; so the height loop only needs to add
|
||||
; r2 - width to the input pointer
|
||||
|
||||
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
||||
add r12, r3, #16 ; square off the output
|
||||
sub sp, sp, #4 ; scratch slot for the destination pointer
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
str r1, [sp] ; push destination to stack
|
||||
mov r7, r7, lsl #16 ; height is top part of counter
|
||||
|
||||
; six tap filter
|
||||
|height_loop_1st_6|
|
||||
ldrb r8, [r0, #-2] ; load source data
|
||||
ldrb r9, [r0, #-1]
|
||||
ldrb r10, [r0], #2
|
||||
orr r7, r7, r3, lsr #2 ; construct loop counter: low part = width / 2
|
||||
|
||||
|width_loop_1st_6|
|
||||
ldrb r11, [r0, #-1]
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0]
|
||||
|
||||
smuad lr, lr, r4 ; apply the filter
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
smuad r8, r8, r4
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0, #1]
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0, #2]
|
||||
|
||||
sub r7, r7, #1
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r11, r10, r6, r8
|
||||
|
||||
ands r10, r7, #0xff ; test loop counter
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0, #-2] ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
add r11, r11, #0x40
|
||||
ldrneb r9, [r0, #-1]
|
||||
usat r11, #8, r11, asr #7
|
||||
|
||||
strh lr, [r1], r12 ; result is transposed and stored, which
|
||||
; will make second pass filtering easier.
|
||||
ldrneb r10, [r0], #2
|
||||
strh r11, [r1], r12
|
||||
|
||||
bne width_loop_1st_6
|
||||
|
||||
ldr r1, [sp] ; load and update dst address
|
||||
subs r7, r7, #0x10000 ; one fewer line left (top halfword of r7)
|
||||
add r0, r0, r2 ; move to next input line
|
||||
|
||||
add r1, r1, #2 ; move over to next column
|
||||
str r1, [sp]
|
||||
|
||||
bne height_loop_1st_6
|
||||
|
||||
add sp, sp, #4
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; --------------------------
|
||||
; 16x16 version
|
||||
; -----------------------------
|
||||
; Same register/stack arguments and filtering loop as
; vp8_filter_block2d_first_pass_armv6, with pld hints added that prefetch
; source data one line ahead of the line being filtered.
|vp8_filter_block2d_first_pass_16x16_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp8_filter address
|
||||
ldr r7, [sp, #36] ; output height
|
||||
|
||||
add r4, r2, #18 ; preload next row
|
||||
pld [r0, r4]
|
||||
|
||||
sub r2, r2, r3 ; inside loop increments input array,
|
||||
; so the height loop only needs to add
|
||||
; r2 - width to the input pointer
|
||||
|
||||
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
||||
add r12, r3, #16 ; square off the output
|
||||
sub sp, sp, #4 ; scratch slot for the destination pointer
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
str r1, [sp] ; push destination to stack
|
||||
mov r7, r7, lsl #16 ; height is top part of counter
|
||||
|
||||
; six tap filter
|
||||
|height_loop_1st_16_6|
|
||||
ldrb r8, [r0, #-2] ; load source data
|
||||
ldrb r9, [r0, #-1]
|
||||
ldrb r10, [r0], #2
|
||||
orr r7, r7, r3, lsr #2 ; construct loop counter: low part = width / 2
|
||||
|
||||
|width_loop_1st_16_6|
|
||||
ldrb r11, [r0, #-1]
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0]
|
||||
|
||||
smuad lr, lr, r4 ; apply the filter
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
smuad r8, r8, r4
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0, #1]
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0, #2]
|
||||
|
||||
sub r7, r7, #1
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r11, r10, r6, r8
|
||||
|
||||
ands r10, r7, #0xff ; test loop counter
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0, #-2] ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
add r11, r11, #0x40
|
||||
ldrneb r9, [r0, #-1]
|
||||
usat r11, #8, r11, asr #7
|
||||
|
||||
strh lr, [r1], r12 ; result is transposed and stored, which
|
||||
; will make second pass filtering easier.
|
||||
ldrneb r10, [r0], #2
|
||||
strh r11, [r1], r12
|
||||
|
||||
bne width_loop_1st_16_6
|
||||
|
||||
ldr r1, [sp] ; load and update dst address
|
||||
subs r7, r7, #0x10000 ; one fewer line left (top halfword of r7)
|
||||
add r0, r0, r2 ; move to next input line
|
||||
|
||||
add r11, r2, #34 ; adding back block width(=16)
|
||||
pld [r0, r11] ; preload next row
|
||||
|
||||
add r1, r1, #2 ; move over to next column
|
||||
str r1, [sp]
|
||||
|
||||
bne height_loop_1st_16_6
|
||||
|
||||
add sp, sp, #4
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; --------------------------
|
||||
; 8x8 version
|
||||
; -----------------------------
|
||||
; Same register/stack arguments and filtering loop as
; vp8_filter_block2d_first_pass_armv6, with pld hints added that prefetch
; source data one line ahead of the line being filtered.
|vp8_filter_block2d_first_pass_8x8_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vp8_filter address
|
||||
ldr r7, [sp, #36] ; output height
|
||||
|
||||
add r4, r2, #10 ; preload next row
|
||||
pld [r0, r4]
|
||||
|
||||
sub r2, r2, r3 ; inside loop increments input array,
|
||||
; so the height loop only needs to add
|
||||
; r2 - width to the input pointer
|
||||
|
||||
mov r3, r3, lsl #1 ; multiply width by 2 because using shorts
|
||||
add r12, r3, #16 ; square off the output
|
||||
sub sp, sp, #4 ; scratch slot for the destination pointer
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
str r1, [sp] ; push destination to stack
|
||||
mov r7, r7, lsl #16 ; height is top part of counter
|
||||
|
||||
; six tap filter
|
||||
|height_loop_1st_8_6|
|
||||
ldrb r8, [r0, #-2] ; load source data
|
||||
ldrb r9, [r0, #-1]
|
||||
ldrb r10, [r0], #2
|
||||
orr r7, r7, r3, lsr #2 ; construct loop counter: low part = width / 2
|
||||
|
||||
|width_loop_1st_8_6|
|
||||
ldrb r11, [r0, #-1]
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0]
|
||||
|
||||
smuad lr, lr, r4 ; apply the filter
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
smuad r8, r8, r4
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0, #1]
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0, #2]
|
||||
|
||||
sub r7, r7, #1
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r11, r10, r6, r8
|
||||
|
||||
ands r10, r7, #0xff ; test loop counter
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0, #-2] ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
add r11, r11, #0x40
|
||||
ldrneb r9, [r0, #-1]
|
||||
usat r11, #8, r11, asr #7
|
||||
|
||||
strh lr, [r1], r12 ; result is transposed and stored, which
|
||||
; will make second pass filtering easier.
|
||||
ldrneb r10, [r0], #2
|
||||
strh r11, [r1], r12
|
||||
|
||||
bne width_loop_1st_8_6
|
||||
|
||||
ldr r1, [sp] ; load and update dst address
|
||||
subs r7, r7, #0x10000 ; one fewer line left (top halfword of r7)
|
||||
add r0, r0, r2 ; move to next input line
|
||||
|
||||
add r11, r2, #18 ; adding back block width(=8)
|
||||
pld [r0, r11] ; preload next row
|
||||
|
||||
add r1, r1, #2 ; move over to next column
|
||||
str r1, [sp]
|
||||
|
||||
bne height_loop_1st_8_6
|
||||
|
||||
add sp, sp, #4
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;---------------------------------
|
||||
; r0 short *src_ptr,
|
||||
; r1 unsigned char *output_ptr,
|
||||
; r2 unsigned int output_pitch,
|
||||
; r3 unsigned int cnt,
|
||||
; stack const short *vp8_filter
|
||||
;---------------------------------
|
||||
; Second pass of the 6-tap filter: reads halfword samples from src_ptr and
; writes 8-bit results to output_ptr. Each inner iteration produces two
; output bytes, stored output_pitch bytes apart; the outer loop then moves
; the destination one byte to the right and the source 16 bytes further on.
; r7 is a combined counter: outer count (cnt) in the top halfword, inner
; iterations (cnt / 2) in the low byte.
|vp8_filter_block2d_second_pass_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #36] ; vp8_filter address
|
||||
sub sp, sp, #4 ; scratch slot for the destination pointer
|
||||
mov r7, r3, lsl #16 ; height is top part of counter
|
||||
str r1, [sp] ; push destination to stack
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
pkhbt r12, r5, r4 ; pack the filter differently
|
||||
pkhbt r11, r6, r5
|
||||
|
||||
sub r0, r0, #4 ; offset input buffer
|
||||
|
||||
|height_loop_2nd|
|
||||
ldr r8, [r0] ; load the data
|
||||
ldr r9, [r0, #4]
|
||||
orr r7, r7, r3, lsr #1 ; loop counter: low part = cnt / 2
|
||||
|
||||
|width_loop_2nd|
|
||||
smuad lr, r4, r8 ; apply filter
|
||||
sub r7, r7, #1
|
||||
smulbt r8, r4, r8
|
||||
|
||||
ldr r10, [r0, #8]
|
||||
|
||||
smlad lr, r5, r9, lr
|
||||
smladx r8, r12, r9, r8
|
||||
|
||||
ldrh r9, [r0, #12]
|
||||
|
||||
smlad lr, r6, r10, lr
|
||||
smladx r8, r11, r10, r8
|
||||
|
||||
add r0, r0, #4
|
||||
smlatb r10, r6, r9, r8
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ands r8, r7, #0xff
|
||||
usat lr, #8, lr, asr #7
|
||||
add r10, r10, #0x40
|
||||
strb lr, [r1], r2 ; the result is transposed back and stored
|
||||
usat r10, #8, r10, asr #7
|
||||
|
||||
ldrne r8, [r0] ; load data for next loop
|
||||
ldrne r9, [r0, #4]
|
||||
strb r10, [r1], r2
|
||||
|
||||
bne width_loop_2nd
|
||||
|
||||
ldr r1, [sp] ; update dst for next loop
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, #16 ; update src for next loop
|
||||
add r1, r1, #1
|
||||
str r1, [sp]
|
||||
|
||||
bne height_loop_2nd
|
||||
|
||||
add sp, sp, #4
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;---------------------------------
|
||||
; r0 short *src_ptr,
|
||||
; r1 unsigned char *output_ptr,
|
||||
; r2 unsigned int output_pitch,
|
||||
; r3 unsigned int cnt,
|
||||
; stack const short *vp8_filter
|
||||
;---------------------------------
|
||||
; Second-pass variant that applies only the two repacked coefficient pairs
; (r12 and r11) to the halfword samples at src_ptr. The rounding constant
; 0x40 is kept in r4 and supplied as the accumulator of the first
; multiply-accumulate, so no separate add is needed before the shift/clamp.
; Instead of saving the destination on the stack, lr holds
; output_ptr + cnt and the next column address is recomputed from it.
|vp8_filter4_block2d_second_pass_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #36] ; vp8_filter address
|
||||
mov r7, r3, lsl #16 ; height is top part of counter
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
add lr, r1, r3 ; save final destination pointer
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
pkhbt r12, r5, r4 ; pack the filter differently
|
||||
pkhbt r11, r6, r5
|
||||
mov r4, #0x40 ; rounding factor (for smlad{x})
|
||||
|
||||
|height_loop_2nd_4|
|
||||
ldrd r8, r9, [r0, #-4] ; load the data
|
||||
orr r7, r7, r3, lsr #1 ; loop counter: low part = cnt / 2
|
||||
|
||||
|width_loop_2nd_4|
|
||||
ldr r10, [r0, #4]!
|
||||
smladx r6, r9, r12, r4 ; apply filter
|
||||
pkhbt r8, r9, r8
|
||||
smlad r5, r8, r12, r4
|
||||
pkhbt r8, r10, r9
|
||||
smladx r6, r10, r11, r6
|
||||
sub r7, r7, #1
|
||||
smlad r5, r8, r11, r5
|
||||
|
||||
mov r8, r9 ; shift the data for the next loop
|
||||
mov r9, r10
|
||||
|
||||
usat r6, #8, r6, asr #7 ; shift and clamp
|
||||
usat r5, #8, r5, asr #7
|
||||
|
||||
strb r5, [r1], r2 ; the result is transposed back and stored
|
||||
tst r7, #0xff
|
||||
strb r6, [r1], r2
|
||||
|
||||
bne width_loop_2nd_4
|
||||
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, #16 ; update src for next loop
|
||||
sub r1, lr, r7, lsr #16 ; update dst for next loop
|
||||
|
||||
bne height_loop_2nd_4
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;------------------------------------
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 unsigned char *output_ptr,
|
||||
; r2 unsigned int src_pixels_per_line
|
||||
; r3 unsigned int cnt,
|
||||
; stack unsigned int output_pitch,
|
||||
; stack const short *vp8_filter
|
||||
;------------------------------------
|
||||
; 6-tap filter over consecutive source bytes, writing 8-bit results directly
; to output_ptr (no intermediate halfword buffer). cnt (r3) is used both as
; the number of lines (r7) and, halved, as the inner iteration count (r12);
; each inner iteration produces two output bytes.
; Stack scratch: [sp] = output_pitch - cnt, [sp, #4] = src_pixels_per_line - cnt.
; r2 is reused to hold the rounding constant 0x40, which is folded into the
; first smlad of each output.
|vp8_filter_block2d_first_pass_only_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
add r7, r2, r3 ; preload next row
|
||||
add r7, r7, #2
|
||||
pld [r0, r7]
|
||||
|
||||
ldr r4, [sp, #36] ; output pitch
|
||||
ldr r11, [sp, #40] ; HFilter address
|
||||
sub sp, sp, #8
|
||||
|
||||
mov r7, r3
|
||||
sub r2, r2, r3 ; inside loop increments input array,
|
||||
; so the height loop only needs to add
|
||||
; r2 - width to the input pointer
|
||||
|
||||
sub r4, r4, r3
|
||||
str r4, [sp] ; save modified output pitch
|
||||
str r2, [sp, #4] ; save modified source stride
|
||||
|
||||
mov r2, #0x40 ; rounding constant, added via smlad below
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
; six tap filter
|
||||
|height_loop_1st_only_6|
|
||||
ldrb r8, [r0, #-2] ; load data
|
||||
ldrb r9, [r0, #-1]
|
||||
ldrb r10, [r0], #2
|
||||
|
||||
mov r12, r3, lsr #1 ; loop counter
|
||||
|
||||
|width_loop_1st_only_6|
|
||||
ldrb r11, [r0, #-1]
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0]
|
||||
|
||||
;; smuad lr, lr, r4
|
||||
smlad lr, lr, r4, r2
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
;; smuad r8, r8, r4
|
||||
smlad r8, r8, r4, r2
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0, #1]
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0, #2]
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r10, r10, r6, r8
|
||||
|
||||
;; add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0, #-2] ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
;; add r10, r10, #0x40
|
||||
strb lr, [r1], #1 ; store the result
|
||||
usat r10, #8, r10, asr #7
|
||||
|
||||
ldrneb r9, [r0, #-1]
|
||||
strb r10, [r1], #1
|
||||
ldrneb r10, [r0], #2
|
||||
|
||||
bne width_loop_1st_only_6
|
||||
|
||||
ldr lr, [sp] ; load back modified output pitch
|
||||
ldr r12, [sp, #4] ; load back modified source stride
|
||||
subs r7, r7, #1
|
||||
add r0, r0, r12 ; update src for next loop
|
||||
|
||||
add r11, r12, r3 ; preload next row
|
||||
add r11, r11, #2
|
||||
pld [r0, r11]
|
||||
|
||||
add r1, r1, lr ; update dst for next loop
|
||||
|
||||
bne height_loop_1st_only_6
|
||||
|
||||
add sp, sp, #8
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_filter_block2d_first_pass_only_armv6|
|
||||
|
||||
|
||||
;------------------------------------
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 unsigned char *output_ptr,
|
||||
; r2 unsigned int src_pixels_per_line
|
||||
; r3 unsigned int cnt,
|
||||
; stack unsigned int output_pitch,
|
||||
; stack const short *vp8_filter
|
||||
;------------------------------------
|
||||
; 6-tap filter down a column of source bytes: taps are read
; src_pixels_per_line bytes apart and 8-bit results are written output_pitch
; bytes apart. The inner loop walks one column, producing two outputs per
; iteration; the outer loop then moves source and destination one byte to
; the right.
; r7 is a combined counter: columns left in the top halfword, outputs left in
; the current column in the low byte.
; Stack scratch: [sp] = current column start in src, [sp, #4] = same for dst.
|vp8_filter_block2d_second_pass_only_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; VFilter address
|
||||
ldr r12, [sp, #36] ; output pitch
|
||||
|
||||
mov r7, r3, lsl #16 ; height is top part of counter
|
||||
sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after
|
||||
|
||||
sub sp, sp, #8
|
||||
|
||||
ldr r4, [r11] ; load up packed filter coefficients
|
||||
ldr r5, [r11, #4]
|
||||
ldr r6, [r11, #8]
|
||||
|
||||
str r0, [sp] ; save r0 to stack
|
||||
str r1, [sp, #4] ; save dst to stack
|
||||
|
||||
; six tap filter
|
||||
|width_loop_2nd_only_6|
|
||||
ldrb r8, [r0], r2 ; load data
|
||||
orr r7, r7, r3 ; loop counter
|
||||
ldrb r9, [r0], r2
|
||||
ldrb r10, [r0], r2
|
||||
|
||||
|height_loop_2nd_only_6|
|
||||
; filter the first column in this inner loop, then move to the next column.
|
||||
ldrb r11, [r0], r2
|
||||
|
||||
pkhbt lr, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r8, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
ldrb r9, [r0], r2
|
||||
|
||||
smuad lr, lr, r4
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
smuad r8, r8, r4
|
||||
pkhbt r11, r11, r9, lsl #16 ; r9 | r11
|
||||
|
||||
smlad lr, r10, r5, lr
|
||||
ldrb r10, [r0], r2
|
||||
smlad r8, r11, r5, r8
|
||||
ldrb r11, [r0]
|
||||
|
||||
sub r7, r7, #2 ; two outputs per iteration
|
||||
sub r0, r0, r2, lsl #2 ; rewind 4 lines; with the 3 reloads below the net advance is 2 lines
|
||||
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
pkhbt r10, r10, r11, lsl #16 ; r11 | r10
|
||||
|
||||
smlad lr, r9, r6, lr
|
||||
smlad r10, r10, r6, r8
|
||||
|
||||
ands r9, r7, #0xff
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
ldrneb r8, [r0], r2 ; load data for next loop
|
||||
usat lr, #8, lr, asr #7
|
||||
add r10, r10, #0x40
|
||||
strb lr, [r1], r12 ; store the result for the column
|
||||
usat r10, #8, r10, asr #7
|
||||
|
||||
ldrneb r9, [r0], r2
|
||||
strb r10, [r1], r12
|
||||
ldrneb r10, [r0], r2
|
||||
|
||||
bne height_loop_2nd_only_6
|
||||
|
||||
ldr r0, [sp]
|
||||
ldr r1, [sp, #4]
|
||||
subs r7, r7, #0x10000
|
||||
add r0, r0, #1 ; move to filter next column
|
||||
str r0, [sp]
|
||||
add r1, r1, #1
|
||||
str r1, [sp, #4]
|
||||
|
||||
bne width_loop_2nd_only_6
|
||||
|
||||
add sp, sp, #8
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_filter_block2d_second_pass_only_armv6|
|
||||
|
||||
END
|
@ -1,100 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8_rtcd.h"
|
||||
|
||||
/*
 * Processes a 4x4 grid of 4x4 blocks (16 blocks, 16 coefficients each).
 *
 * q      coefficients, 16 shorts per block, blocks stored consecutively.
 * dq     dequantization factors, shared by every block.
 * dst    top-left destination pixel; blocks in a row are 4 bytes apart and
 *        rows of blocks are 4 * stride bytes apart.
 * stride destination stride in bytes.
 * eobs   one entry per block, in the same order as q.
 *
 * For each block:
 *   eob > 1   the block is handed to vp8_dequant_idct_add_v6();
 *   eob == 1  only the DC term is used: q[0] * dq[0] is passed to
 *             vp8_dc_only_idct_add_v6() and the first two coefficients are
 *             then cleared;
 *   otherwise the block is left untouched.
 */
void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst,
                                     int stride, char *eobs) {
  int row;

  for (row = 0; row < 4; ++row) {
    int col;

    for (col = 0; col < 4; ++col) {
      short *const blk_q = q + 16 * col;
      unsigned char *const blk_dst = dst + 4 * col;

      if (eobs[col] > 1) {
        vp8_dequant_idct_add_v6(blk_q, dq, blk_dst, stride);
      } else if (eobs[col] == 1) {
        vp8_dc_only_idct_add_v6(blk_q[0] * dq[0], blk_dst, stride, blk_dst,
                                stride);
        /* Clear the first two coefficients. This used to be a single store
         * through an (int *) cast, which accesses short objects through an
         * incompatible lvalue type and silently assumes a 4-byte int. */
        blk_q[0] = 0;
        blk_q[1] = 0;
      }
    }

    q += 64;
    dst += 4 * stride;
    eobs += 4;
  }
}
|
||||
|
||||
/*
 * Processes two 2x2 grids of 4x4 blocks: first the four blocks written to
 * dstu, then the four blocks written to dstv.
 *
 * q      coefficients, 16 shorts per block; the four dstu blocks come first,
 *        immediately followed by the four dstv blocks.
 * dq     dequantization factors, shared by every block.
 * dstu   top-left destination pixel of the first plane.
 * dstv   top-left destination pixel of the second plane.
 * stride destination stride in bytes (same for both planes).
 * eobs   one entry per block, in the same order as q.
 *
 * For each block:
 *   eob > 1   the block is handed to vp8_dequant_idct_add_v6();
 *   eob == 1  only the DC term is used: q[0] * dq[0] is passed to
 *             vp8_dc_only_idct_add_v6() and the first two coefficients are
 *             then cleared;
 *   otherwise the block is left untouched.
 */
void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dstu,
                                      unsigned char *dstv, int stride,
                                      char *eobs) {
  unsigned char *plane_dst[2];
  int plane;

  plane_dst[0] = dstu;
  plane_dst[1] = dstv;

  for (plane = 0; plane < 2; ++plane) {
    unsigned char *dst = plane_dst[plane];
    int row;

    for (row = 0; row < 2; ++row) {
      int col;

      for (col = 0; col < 2; ++col) {
        short *const blk_q = q + 16 * col;
        unsigned char *const blk_dst = dst + 4 * col;

        if (eobs[col] > 1) {
          vp8_dequant_idct_add_v6(blk_q, dq, blk_dst, stride);
        } else if (eobs[col] == 1) {
          vp8_dc_only_idct_add_v6(blk_q[0] * dq[0], blk_dst, stride, blk_dst,
                                  stride);
          /* Clear the first two coefficients. This used to be a single store
           * through an (int *) cast, which accesses short objects through an
           * incompatible lvalue type and silently assumes a 4-byte int. */
          blk_q[0] = 0;
          blk_q[1] = 0;
        }
      }

      /* q and eobs keep advancing across the plane boundary. */
      q += 32;
      dst += 4 * stride;
      eobs += 2;
    }
  }
}
|
@ -1,202 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_short_idct4x4llm_v6_dual|
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
|
||||
|
||||
; void vp8_short_idct4x4llm_v6_dual(short *input, unsigned char *pred, int pitch,
|
||||
; unsigned char *dst, int stride)
|
||||
; r0 short* input
|
||||
; r1 unsigned char* pred
|
||||
; r2 int pitch
|
||||
; r3 unsigned char* dst
|
||||
; sp int stride
|
||||
|
||||
; 4x4 inverse transform that processes two 16-bit lanes per register.
;   loop1_dual: first pass; intermediate results are written back into the
;               input buffer.
;   loop2_dual: second pass; adds 4, shifts right by 3, adds the predictor
;               bytes, saturates to 8 bits and writes two 4-byte rows to dst
;               per iteration.
; r4 = sinpi8sqrt2 (0x8A8C). r5 = cospi8sqrt2minus1 (0x4E7B) in the low
; halfword with bit 31 doubling as the loop counter: "subs r5, r5, #1<<31"
; leaves carry set the first time and clear the second, so each loop body
; runs twice and bit 31 is set again on entry to loop2_dual.
; Stack: [sp] holds the input pointer during loop2_dual; the caller's stride
; argument is at [sp, #40] (9 saved registers + the 4-byte scratch slot).
|vp8_short_idct4x4llm_v6_dual| PROC
|
||||
stmdb sp!, {r4-r11, lr}
|
||||
|
||||
sub sp, sp, #4 ; scratch slot for the input pointer
|
||||
|
||||
mov r4, #0x00008A00 ; sin
|
||||
orr r4, r4, #0x0000008C ; sinpi8sqrt2
|
||||
|
||||
mov r5, #0x00004E00 ; cos
|
||||
orr r5, r5, #0x0000007B ; cospi8sqrt2minus1
|
||||
orr r5, r5, #1<<31 ; loop counter on top bit
|
||||
|
||||
loop1_dual
|
||||
ldr r6, [r0, #(4*2)] ; i5 | i4
|
||||
ldr r12, [r0, #(12*2)] ; i13|i12
|
||||
ldr r14, [r0, #(8*2)] ; i9 | i8
|
||||
|
||||
smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
|
||||
smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16
|
||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
|
||||
smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16
|
||||
|
||||
smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16
|
||||
pkhtb r7, r9, r7, asr #16 ; 5c | 4c
|
||||
pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
|
||||
uadd16 r6, r6, r7 ; 5c+5 | 4c+4
|
||||
|
||||
smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16
|
||||
smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16
|
||||
smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16
|
||||
|
||||
subs r5, r5, #1<<31 ; i--
|
||||
|
||||
pkhtb r9, r11, r9, asr #16 ; 13c | 12c
|
||||
ldr r11, [r0] ; i1 | i0
|
||||
pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
|
||||
uadd16 r7, r12, r9 ; 13c+13 | 12c+12
|
||||
|
||||
usub16 r7, r8, r7 ; c
|
||||
uadd16 r6, r6, r10 ; d
|
||||
uadd16 r10, r11, r14 ; a
|
||||
usub16 r8, r11, r14 ; b
|
||||
|
||||
uadd16 r9, r10, r6 ; a+d
|
||||
usub16 r10, r10, r6 ; a-d
|
||||
uadd16 r6, r8, r7 ; b+c
|
||||
usub16 r7, r8, r7 ; b-c
|
||||
|
||||
; use input buffer to store intermediate results
|
||||
str r6, [r0, #(4*2)] ; o5 | o4
|
||||
str r7, [r0, #(8*2)] ; o9 | o8
|
||||
str r10,[r0, #(12*2)] ; o13|o12
|
||||
str r9, [r0], #4 ; o1 | o0
|
||||
|
||||
bcs loop1_dual
|
||||
|
||||
sub r0, r0, #8 ; reset input/output
|
||||
str r0, [sp]
|
||||
|
||||
loop2_dual
|
||||
|
||||
ldr r6, [r0, #(4*2)] ; i5 | i4
|
||||
ldr r12,[r0, #(2*2)] ; i3 | i2
|
||||
ldr r14,[r0, #(6*2)] ; i7 | i6
|
||||
ldr r0, [r0, #(0*2)] ; i1 | i0
|
||||
|
||||
smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16
|
||||
smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16
|
||||
smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16
|
||||
smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16
|
||||
|
||||
pkhbt r11, r6, r0, lsl #16 ; i0 | i4
|
||||
pkhtb r7, r7, r9, asr #16 ; 1c | 5c
|
||||
pkhtb r0, r0, r6, asr #16 ; i1 | i5
|
||||
pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1
|
||||
|
||||
uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2
|
||||
pkhbt r9, r14, r12, lsl #16 ; i2 | i6
|
||||
uadd16 r10, r11, r9 ; a
|
||||
usub16 r9, r11, r9 ; b
|
||||
pkhtb r6, r12, r14, asr #16 ; i3 | i7
|
||||
|
||||
subs r5, r5, #1<<31 ; i--
|
||||
|
||||
smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16
|
||||
smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16
|
||||
smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16
|
||||
smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16
|
||||
|
||||
pkhtb r7, r7, r12, asr #16 ; 3c | 7c
|
||||
pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1
|
||||
|
||||
uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2
|
||||
usub16 r12, r8, r6 ; c (o1 | o5)
|
||||
uadd16 r6, r11, r0 ; d (o3 | o7)
|
||||
uadd16 r7, r10, r6 ; a+d
|
||||
|
||||
mov r8, #4 ; set up 4's
|
||||
orr r8, r8, #0x40000 ; 4|4
|
||||
|
||||
usub16 r6, r10, r6 ; a-d
|
||||
uadd16 r6, r6, r8 ; a-d+4, 3|7
|
||||
uadd16 r7, r7, r8 ; a+d+4, 0|4
|
||||
uadd16 r10, r9, r12 ; b+c
|
||||
usub16 r0, r9, r12 ; b-c
|
||||
uadd16 r10, r10, r8 ; b+c+4, 1|5
|
||||
uadd16 r8, r0, r8 ; b-c+4, 2|6
|
||||
|
||||
ldr lr, [sp, #40] ; dst stride
|
||||
|
||||
ldrb r0, [r1] ; pred p0
|
||||
ldrb r11, [r1, #1] ; pred p1
|
||||
ldrb r12, [r1, #2] ; pred p2
|
||||
|
||||
add r0, r0, r7, asr #19 ; p0 + o0
|
||||
add r11, r11, r10, asr #19 ; p1 + o1
|
||||
add r12, r12, r8, asr #19 ; p2 + o2
|
||||
|
||||
usat r0, #8, r0 ; d0 = clip8(p0 + o0)
|
||||
usat r11, #8, r11 ; d1 = clip8(p1 + o1)
|
||||
usat r12, #8, r12 ; d2 = clip8(p2 + o2)
|
||||
|
||||
add r0, r0, r11, lsl #8 ; |--|--|d1|d0|
|
||||
|
||||
ldrb r11, [r1, #3] ; pred p3
|
||||
|
||||
add r0, r0, r12, lsl #16 ; |--|d2|d1|d0|
|
||||
|
||||
add r11, r11, r6, asr #19 ; p3 + o3
|
||||
|
||||
sxth r7, r7 ;
|
||||
sxth r10, r10 ;
|
||||
|
||||
usat r11, #8, r11 ; d3 = clip8(p3 + o3)
|
||||
|
||||
sxth r8, r8 ;
|
||||
sxth r6, r6 ;
|
||||
|
||||
add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0|
|
||||
|
||||
ldrb r12, [r1, r2]! ; pred p4
|
||||
str r0, [r3], lr
|
||||
ldrb r11, [r1, #1] ; pred p5
|
||||
|
||||
add r12, r12, r7, asr #3 ; p4 + o4
|
||||
add r11, r11, r10, asr #3 ; p5 + o5
|
||||
|
||||
usat r12, #8, r12 ; d4 = clip8(p4 + o4)
|
||||
usat r11, #8, r11 ; d5 = clip8(p5 + o5)
|
||||
|
||||
ldrb r7, [r1, #2] ; pred p6
|
||||
ldrb r10, [r1, #3] ; pred p7
|
||||
|
||||
add r12, r12, r11, lsl #8 ; |--|--|d5|d4|
|
||||
|
||||
add r7, r7, r8, asr #3 ; p6 + o6
|
||||
add r10, r10, r6, asr #3 ; p7 + o7
|
||||
|
||||
ldr r0, [sp] ; load input pointer
|
||||
|
||||
usat r7, #8, r7 ; d6 = clip8(p6 + o6)
|
||||
usat r10, #8, r10 ; d7 = clip8(p7 + o7)
|
||||
|
||||
add r12, r12, r7, lsl #16 ; |--|d6|d5|d4|
|
||||
add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4|
|
||||
|
||||
str r12, [r3], lr
|
||||
add r0, r0, #16
|
||||
add r1, r1, r2 ; pred + pitch
|
||||
|
||||
bcs loop2_dual
|
||||
|
||||
add sp, sp, #4 ; release the input-pointer scratch slot
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
@ -1,136 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_short_inv_walsh4x4_v6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff)
|
||||
; r0 short *input       16 halfwords, all loaded up front into r2-r9
; r1 short *mb_dqcoeff  output pointer
;
; 4x4 inverse Walsh-Hadamard transform using saturating packed 16-bit
; arithmetic (qadd16/qsub16 and the exchange forms). After the second
; transform 3 is added to every lane and each lane is arithmetically shifted
; right by 3. The 16 results are stored as halfwords 32 bytes apart
; (strh ..., #32), i.e. one result per 16-halfword stride of mb_dqcoeff.
|vp8_short_inv_walsh4x4_v6| PROC
|
||||
|
||||
stmdb sp!, {r4 - r12, lr}
|
||||
|
||||
ldr r2, [r0, #0] ; [1 | 0]
|
||||
ldr r3, [r0, #4] ; [3 | 2]
|
||||
ldr r4, [r0, #8] ; [5 | 4]
|
||||
ldr r5, [r0, #12] ; [7 | 6]
|
||||
ldr r6, [r0, #16] ; [9 | 8]
|
||||
ldr r7, [r0, #20] ; [11 | 10]
|
||||
ldr r8, [r0, #24] ; [13 | 12]
|
||||
ldr r9, [r0, #28] ; [15 | 14]
|
||||
|
||||
qadd16 r10, r2, r8 ; a1 [1+13 | 0+12]
|
||||
qadd16 r11, r4, r6 ; b1 [5+9 | 4+8]
|
||||
qsub16 r12, r4, r6 ; c1 [5-9 | 4-8]
|
||||
qsub16 lr, r2, r8 ; d1 [1-13 | 0-12]
|
||||
|
||||
qadd16 r2, r10, r11 ; a1 + b1 [1 | 0]
|
||||
qadd16 r4, r12, lr ; c1 + d1 [5 | 4]
|
||||
qsub16 r6, r10, r11 ; a1 - b1 [9 | 8]
|
||||
qsub16 r8, lr, r12 ; d1 - c1 [13 | 12]
|
||||
|
||||
qadd16 r10, r3, r9 ; a1 [3+15 | 2+14]
|
||||
qadd16 r11, r5, r7 ; b1 [7+11 | 6+10]
|
||||
qsub16 r12, r5, r7 ; c1 [7-11 | 6-10]
|
||||
qsub16 lr, r3, r9 ; d1 [3-15 | 2-14]
|
||||
|
||||
qadd16 r3, r10, r11 ; a1 + b1 [3 | 2]
|
||||
qadd16 r5, r12, lr ; c1 + d1 [7 | 6]
|
||||
qsub16 r7, r10, r11 ; a1 - b1 [11 | 10]
|
||||
qsub16 r9, lr, r12 ; d1 - c1 [15 | 14]
|
||||
|
||||
; first transform complete
|
||||
|
||||
qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3]
|
||||
qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3]
|
||||
qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7]
|
||||
qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7]
|
||||
|
||||
qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1]
|
||||
qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1]
|
||||
ldr r10, c0x00030003 ; rounding constant 3 in both halfwords
|
||||
qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1]
|
||||
qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1]
|
||||
|
||||
qadd16 r2, r2, r10 ; [b2+3|c2+3]
|
||||
qadd16 r3, r3, r10 ; [a2+3|d2+3]
|
||||
qadd16 r4, r4, r10 ; [b2+3|c2+3]
|
||||
qadd16 r5, r5, r10 ; [a2+3|d2+3]
|
||||
|
||||
asr r12, r3, #19 ; [0]
|
||||
strh r12, [r1], #32
|
||||
asr lr, r2, #19 ; [1]
|
||||
strh lr, [r1], #32
|
||||
sxth r2, r2
|
||||
sxth r3, r3
|
||||
asr r2, r2, #3 ; [2]
|
||||
strh r2, [r1], #32
|
||||
asr r3, r3, #3 ; [3]
|
||||
strh r3, [r1], #32
|
||||
|
||||
asr r12, r5, #19 ; [4]
|
||||
strh r12, [r1], #32
|
||||
asr lr, r4, #19 ; [5]
|
||||
strh lr, [r1], #32
|
||||
sxth r4, r4
|
||||
sxth r5, r5
|
||||
asr r4, r4, #3 ; [6]
|
||||
strh r4, [r1], #32
|
||||
asr r5, r5, #3 ; [7]
|
||||
strh r5, [r1], #32
|
||||
|
||||
qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11]
|
||||
qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11]
|
||||
qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15]
|
||||
qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15]
|
||||
|
||||
qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1]
|
||||
qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1]
|
||||
qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1]
|
||||
qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1]
|
||||
|
||||
qadd16 r6, r6, r10 ; [b2+3|c2+3]
|
||||
qadd16 r7, r7, r10 ; [a2+3|d2+3]
|
||||
qadd16 r8, r8, r10 ; [b2+3|c2+3]
|
||||
qadd16 r9, r9, r10 ; [a2+3|d2+3]
|
||||
|
||||
asr r12, r7, #19 ; [8]
|
||||
strh r12, [r1], #32
|
||||
asr lr, r6, #19 ; [9]
|
||||
strh lr, [r1], #32
|
||||
sxth r6, r6
|
||||
sxth r7, r7
|
||||
asr r6, r6, #3 ; [10]
|
||||
strh r6, [r1], #32
|
||||
asr r7, r7, #3 ; [11]
|
||||
strh r7, [r1], #32
|
||||
|
||||
asr r12, r9, #19 ; [12]
|
||||
strh r12, [r1], #32
|
||||
asr lr, r8, #19 ; [13]
|
||||
strh lr, [r1], #32
|
||||
sxth r8, r8
|
||||
sxth r9, r9
|
||||
asr r8, r8, #3 ; [14]
|
||||
strh r8, [r1], #32
|
||||
asr r9, r9, #3 ; [15]
|
||||
strh r9, [r1], #32
|
||||
|
||||
ldmia sp!, {r4 - r12, pc}
|
||||
ENDP ; |vp8_short_inv_walsh4x4_v6|
|
||||
|
||||
|
||||
; Constant Pool
|
||||
c0x00030003 DCD 0x00030003
|
||||
END
|
File diff suppressed because it is too large
Load Diff
@ -1,286 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6|
|
||||
EXPORT |vp8_loop_filter_simple_vertical_edge_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
MACRO
|
||||
TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3
|
||||
; Transposes a 4x4 matrix of bytes held in four 32-bit registers.
; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3
; Each input register holds one row (4 bytes); each output register receives
; one column. The input registers $a0-$a3 are overwritten as scratch.
; In the lane diagrams below, "rc" means row r, column c, and the most
; significant byte is written first.
|
||||
; a0: 03 02 01 00
|
||||
; a1: 13 12 11 10
|
||||
; a2: 23 22 21 20
|
||||
; a3: 33 32 31 30
|
||||
; b3 b2 b1 b0
|
||||
|
||||
uxtb16 $b1, $a1 ; xx 12 xx 10
|
||||
uxtb16 $b0, $a0 ; xx 02 xx 00
|
||||
uxtb16 $b3, $a3 ; xx 32 xx 30
|
||||
uxtb16 $b2, $a2 ; xx 22 xx 20
|
||||
orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00
|
||||
orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20
|
||||
|
||||
uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11
|
||||
uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31
|
||||
uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01
|
||||
uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21
|
||||
orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01
|
||||
orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21
|
||||
|
||||
pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1
|
||||
pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3
|
||||
|
||||
pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0
|
||||
pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2
|
||||
MEND
|
||||
|
||||
|
||||
|
||||
src RN r0
|
||||
pstep RN r1
|
||||
|
||||
;r0 unsigned char *src_ptr,
|
||||
;r1 int src_pixel_step,
|
||||
;r2 const char *blimit
|
||||
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
; Simple loop filter applied across a horizontal edge.
;   r0 (src)   - pointer to the first q0 pixel (the row just below the edge)
;   r1 (pstep) - byte distance between rows
;   r2         - pointer to the blimit byte
; Each loop iteration loads one 32-bit word (4 pixels) from each of the rows
; p1, p0, q0 and q1 and filters the four pixel columns in parallel, one per
; byte lane. The loop runs 4 times, so 16 pixels along the edge are processed.
; Only the p0 and q0 rows are written back.
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldrb r12, [r2] ; blimit
|
||||
ldr r3, [src, -pstep, lsl #1] ; p1
|
||||
ldr r4, [src, -pstep] ; p0
|
||||
ldr r5, [src] ; q0
|
||||
ldr r6, [src, pstep] ; q1
|
||||
orr r12, r12, r12, lsl #8 ; blimit
|
||||
ldr r2, c0x80808080
|
||||
orr r12, r12, r12, lsl #16 ; blimit replicated into all four byte lanes
|
||||
mov r9, #4 ; double the count. we're doing 4 at a time
|
||||
mov lr, #0 ; need 0 in a couple places
|
||||
|
||||
|simple_hnext8|
|
||||
; vp8_simple_filter_mask()
|
||||
|
||||
uqsub8 r7, r3, r6 ; p1 - q1
|
||||
uqsub8 r8, r6, r3 ; q1 - p1
|
||||
uqsub8 r10, r4, r5 ; p0 - q0
|
||||
uqsub8 r11, r5, r4 ; q0 - p0
|
||||
orr r8, r8, r7 ; abs(p1 - q1)
|
||||
orr r10, r10, r11 ; abs(p0 - q0)
|
||||
uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
|
||||
uhadd8 r8, r8, lr ; abs(p1 - q1) >> 1
|
||||
uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
|
||||
mvn r8, #0
|
||||
usub8 r10, r12, r10 ; compare to blimit. usub8 sets GE flags
|
||||
sel r10, r8, lr ; filter mask: 0xFF per lane where the sum <= blimit, else 0
|
||||
cmp r10, #0
|
||||
beq simple_hskip_filter ; skip filtering if all masks are 0x00
|
||||
|
||||
;vp8_simple_filter()
|
||||
|
||||
eor r3, r3, r2 ; p1 offset to convert to a signed value
|
||||
eor r6, r6, r2 ; q1 offset to convert to a signed value
|
||||
eor r4, r4, r2 ; p0 offset to convert to a signed value
|
||||
eor r5, r5, r2 ; q0 offset to convert to a signed value
|
||||
|
||||
qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
|
||||
qsub8 r6, r5, r4 ; q0 - p0
|
||||
qadd8 r3, r3, r6 ; += q0 - p0
|
||||
ldr r7, c0x04040404
|
||||
qadd8 r3, r3, r6 ; += q0 - p0
|
||||
ldr r8, c0x03030303
|
||||
qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
|
||||
;STALL
|
||||
and r3, r3, r10 ; vp8_filter &= mask
|
||||
|
||||
qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4
|
||||
qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3
|
||||
|
||||
; lr is 0, so each shadd8 below is a per-byte signed halving (>> 1)
shadd8 r7 , r7 , lr
|
||||
shadd8 r8 , r8 , lr
|
||||
shadd8 r7 , r7 , lr
|
||||
shadd8 r8 , r8 , lr
|
||||
shadd8 r7 , r7 , lr ; Filter1 >>= 3
|
||||
shadd8 r8 , r8 , lr ; Filter2 >>= 3
|
||||
|
||||
qsub8 r5 ,r5, r7 ; u = q0 - Filter1
|
||||
qadd8 r4, r4, r8 ; u = p0 + Filter2
|
||||
eor r5, r5, r2 ; *oq0 = u^0x80
|
||||
str r5, [src] ; store oq0 result
|
||||
eor r4, r4, r2 ; *op0 = u^0x80
|
||||
str r4, [src, -pstep] ; store op0 result
|
||||
|
||||
|simple_hskip_filter|
|
||||
subs r9, r9, #1
|
||||
addne src, src, #4 ; advance 4 pixels along the edge
|
||||
|
||||
ldrne r3, [src, -pstep, lsl #1] ; p1
|
||||
ldrne r4, [src, -pstep] ; p0
|
||||
ldrne r5, [src] ; q0
|
||||
ldrne r6, [src, pstep] ; q1
|
||||
|
||||
bne simple_hnext8
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6|
|
||||
|
||||
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
; Simple loop filter applied across a vertical edge.
;   r0 (src)   - pointer to the q0 pixel of the first row (just right of the edge)
;   r1 (pstep) - byte distance between rows
;   r2         - pointer to the blimit byte
; For each row the four bytes at src-2 .. src+1 (p1 p0 q0 q1) are loaded as two
; halfwords. Four rows are gathered, transposed with TRANSPOSE_MATRIX so that
; each register holds one pixel column, and filtered in parallel. The loop
; runs 4 times, so 16 rows are processed. Only p0 (src-1) and q0 (src) are
; written back, one byte per row.
|vp8_loop_filter_simple_vertical_edge_armv6| PROC
|
||||
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldrb r12, [r2] ; r12: blimit
|
||||
ldr r2, c0x80808080
|
||||
orr r12, r12, r12, lsl #8
|
||||
|
||||
; load source data to r7, r8, r9, r10
|
||||
ldrh r3, [src, #-2]
|
||||
pld [src, #23] ; preload for next block
|
||||
ldrh r4, [src], pstep
|
||||
orr r12, r12, r12, lsl #16 ; blimit replicated into all four byte lanes
|
||||
|
||||
ldrh r5, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrh r6, [src], pstep
|
||||
|
||||
pkhbt r7, r3, r4, lsl #16
|
||||
|
||||
ldrh r3, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrh r4, [src], pstep
|
||||
|
||||
pkhbt r8, r5, r6, lsl #16
|
||||
|
||||
ldrh r5, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrh r6, [src], pstep
|
||||
mov r11, #4 ; double the count. we're doing 4 at a time
|
||||
|
||||
|simple_vnext8|
|
||||
; vp8_simple_filter_mask() function
|
||||
pkhbt r9, r3, r4, lsl #16
|
||||
pkhbt r10, r5, r6, lsl #16
|
||||
|
||||
;transpose r7, r8, r9, r10 to r3, r4, r5, r6
|
||||
TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6
|
||||
|
||||
uqsub8 r7, r3, r6 ; p1 - q1
|
||||
uqsub8 r8, r6, r3 ; q1 - p1
|
||||
uqsub8 r9, r4, r5 ; p0 - q0
|
||||
uqsub8 r10, r5, r4 ; q0 - p0
|
||||
orr r7, r7, r8 ; abs(p1 - q1)
|
||||
orr r9, r9, r10 ; abs(p0 - q0)
|
||||
mov r8, #0
|
||||
uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
|
||||
uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
|
||||
uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
|
||||
mvn r10, #0 ; r10 == -1
|
||||
|
||||
usub8 r7, r12, r7 ; compare to blimit; usub8 sets GE flags
|
||||
sel lr, r10, r8 ; filter mask: 0xFF per lane where the sum <= blimit, else 0
|
||||
|
||||
cmp lr, #0
|
||||
beq simple_vskip_filter ; skip filtering
|
||||
|
||||
;vp8_simple_filter() function
|
||||
eor r3, r3, r2 ; p1 offset to convert to a signed value
|
||||
eor r6, r6, r2 ; q1 offset to convert to a signed value
|
||||
eor r4, r4, r2 ; p0 offset to convert to a signed value
|
||||
eor r5, r5, r2 ; q0 offset to convert to a signed value
|
||||
|
||||
qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
|
||||
qsub8 r6, r5, r4 ; q0 - p0
|
||||
|
||||
qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
|
||||
ldr r9, c0x03030303 ; r9 = 3
|
||||
|
||||
qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
|
||||
ldr r7, c0x04040404
|
||||
|
||||
qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
|
||||
;STALL
|
||||
and r3, r3, lr ; vp8_filter &= mask
|
||||
|
||||
qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3
|
||||
qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4
|
||||
|
||||
; r8 is 0, so each shadd8 below is a per-byte signed halving (>> 1)
shadd8 r9 , r9 , r8
|
||||
shadd8 r3 , r3 , r8
|
||||
shadd8 r9 , r9 , r8
|
||||
shadd8 r3 , r3 , r8
|
||||
shadd8 r9 , r9 , r8 ; Filter2 >>= 3
|
||||
shadd8 r3 , r3 , r8 ; Filter1 >>= 3
|
||||
|
||||
;calculate output
|
||||
sub src, src, pstep, lsl #2 ; rewind src to the first of the 4 rows just loaded
|
||||
|
||||
qadd8 r4, r4, r9 ; u = p0 + Filter2
|
||||
qsub8 r5, r5, r3 ; u = q0 - Filter1
|
||||
eor r4, r4, r2 ; *op0 = u^0x80
|
||||
eor r5, r5, r2 ; *oq0 = u^0x80
|
||||
|
||||
strb r4, [src, #-1] ; store the result
|
||||
mov r4, r4, lsr #8
|
||||
strb r5, [src], pstep
|
||||
mov r5, r5, lsr #8
|
||||
|
||||
strb r4, [src, #-1]
|
||||
mov r4, r4, lsr #8
|
||||
strb r5, [src], pstep
|
||||
mov r5, r5, lsr #8
|
||||
|
||||
strb r4, [src, #-1]
|
||||
mov r4, r4, lsr #8
|
||||
strb r5, [src], pstep
|
||||
mov r5, r5, lsr #8
|
||||
|
||||
strb r4, [src, #-1]
|
||||
strb r5, [src], pstep
|
||||
|
||||
|simple_vskip_filter|
|
||||
subs r11, r11, #1
|
||||
|
||||
; load source data to r7, r8, r9, r10
; (the loads are conditional on another iteration remaining; the pld and
; pkhbt instructions below execute unconditionally)
|
||||
ldrneh r3, [src, #-2]
|
||||
pld [src, #23] ; preload for next block
|
||||
ldrneh r4, [src], pstep
|
||||
|
||||
ldrneh r5, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrneh r6, [src], pstep
|
||||
|
||||
pkhbt r7, r3, r4, lsl #16
|
||||
|
||||
ldrneh r3, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrneh r4, [src], pstep
|
||||
|
||||
pkhbt r8, r5, r6, lsl #16
|
||||
|
||||
ldrneh r5, [src, #-2]
|
||||
pld [src, #23]
|
||||
ldrneh r6, [src], pstep
|
||||
|
||||
bne simple_vnext8
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6|
|
||||
|
||||
; Constant Pool
|
||||
c0x80808080 DCD 0x80808080
|
||||
c0x03030303 DCD 0x03030303
|
||||
c0x04040404 DCD 0x04040404
|
||||
|
||||
END
|
@ -1,273 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vp8_sixtap_predict8x4_armv6|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
;-------------------------------------
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 int src_pixels_per_line,
|
||||
; r2 int xoffset,
|
||||
; r3 int yoffset,
|
||||
; stack unsigned char *dst_ptr,
|
||||
; stack int dst_pitch
|
||||
;-------------------------------------
|
||||
;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184.
|
||||
;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack,
|
||||
;and the result is stored in transpose.
|
||||
; Six-tap sub-pixel prediction for a block 8 pixels wide and 4 pixels high.
; Overview of the code below:
;  - 184 bytes are reserved on the stack: yoffset at [sp], temporary buffer
;    from sp+4.
;  - First pass (skipped when xoffset == 0): filters 9 input rows horizontally,
;    starting 2 rows above src, and stores the 8 results of each row as
;    halfwords, transposed (consecutive results are 20 bytes apart).
;    When skipped, the source bytes are copied into the same layout.
;  - Second pass (skipped when yoffset == 0): filters the temporary buffer
;    vertically and writes bytes to dst, 4 rows per column for 8 columns.
;    When skipped, the buffered values are copied out directly.
; The stack pointer itself is used to walk the temporary buffer in the second
; pass and is back at the saved-register area before each return.
|vp8_sixtap_predict8x4_armv6| PROC
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset
|
||||
|
||||
cmp r2, #0 ;skip first_pass filter if xoffset=0
|
||||
add lr, sp, #4 ;point to temporary buffer
|
||||
beq skip_firstpass_filter
|
||||
|
||||
;first-pass filter
|
||||
adr r12, filter8_coeff
|
||||
sub r0, r0, r1, lsl #1 ; start 2 rows above src
|
||||
|
||||
add r3, r1, #10 ; preload next row
|
||||
pld [r0, r3]
|
||||
|
||||
add r2, r12, r2, lsl #4 ;calculate filter location
|
||||
add r0, r0, #3 ;adjust src only for loading convenience
|
||||
|
||||
ldr r3, [r2] ; load up packed filter coefficients
|
||||
ldr r4, [r2, #4]
|
||||
ldr r5, [r2, #8]
|
||||
|
||||
mov r2, #0x90000 ; height=9 is top part of counter
|
||||
|
||||
sub r1, r1, #8
|
||||
|
||||
|first_pass_hloop_v6|
|
||||
ldrb r6, [r0, #-5] ; load source data
|
||||
ldrb r7, [r0, #-4]
|
||||
ldrb r8, [r0, #-3]
|
||||
ldrb r9, [r0, #-2]
|
||||
ldrb r10, [r0, #-1]
|
||||
|
||||
orr r2, r2, #0x4 ; construct loop counter. width=8=4x2
|
||||
|
||||
pkhbt r6, r6, r7, lsl #16 ; r7 | r6
|
||||
pkhbt r7, r7, r8, lsl #16 ; r8 | r7
|
||||
|
||||
pkhbt r8, r8, r9, lsl #16 ; r9 | r8
|
||||
pkhbt r9, r9, r10, lsl #16 ; r10 | r9
|
||||
|
||||
|first_pass_wloop_v6|
|
||||
smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1]
|
||||
smuad r12, r7, r3
|
||||
|
||||
ldrb r6, [r0], #1
|
||||
|
||||
smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3]
|
||||
ldrb r7, [r0], #1
|
||||
smlad r12, r9, r4, r12
|
||||
|
||||
pkhbt r10, r10, r6, lsl #16 ; r10 | r9
|
||||
pkhbt r6, r6, r7, lsl #16 ; r11 | r10
|
||||
smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5]
|
||||
smlad r12, r6, r5, r12
|
||||
|
||||
sub r2, r2, #1
|
||||
|
||||
add r11, r11, #0x40 ; round_shift_and_clamp
|
||||
tst r2, #0xff ; test loop counter
|
||||
usat r11, #8, r11, asr #7
|
||||
add r12, r12, #0x40
|
||||
strh r11, [lr], #20 ; result is stored transposed: consecutive results are 20 bytes apart
|
||||
usat r12, #8, r12, asr #7
|
||||
|
||||
strh r12, [lr], #20
|
||||
|
||||
movne r11, r6
|
||||
movne r12, r7
|
||||
|
||||
movne r6, r8
|
||||
movne r7, r9
|
||||
movne r8, r10
|
||||
movne r9, r11
|
||||
movne r10, r12
|
||||
|
||||
bne first_pass_wloop_v6
|
||||
|
||||
;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines
|
||||
;;IF ARCHITECTURE=6
|
||||
;pld [src, ppl]
|
||||
;;pld [src, r9]
|
||||
;;ENDIF
|
||||
|
||||
subs r2, r2, #0x10000
|
||||
|
||||
sub lr, lr, #158 ; 8 stores advanced lr by 160; net +2 selects the next halfword slot
|
||||
|
||||
add r0, r0, r1 ; move to next input line
|
||||
|
||||
add r11, r1, #18 ; preload next row. adding back block width(=8), which is subtracted earlier
|
||||
pld [r0, r11]
|
||||
|
||||
bne first_pass_hloop_v6
|
||||
|
||||
;second pass filter
|
||||
secondpass_filter
|
||||
ldr r3, [sp], #4 ; load back yoffset
|
||||
ldr r0, [sp, #216] ; load dst address from stack 180+36
|
||||
ldr r1, [sp, #220] ; load dst stride from stack 180+40
|
||||
|
||||
cmp r3, #0
|
||||
beq skip_secondpass_filter
|
||||
|
||||
adr r12, filter8_coeff
|
||||
add lr, r12, r3, lsl #4 ;calculate filter location
|
||||
|
||||
mov r2, #0x00080000 ; 8 columns in the top part of the counter
|
||||
|
||||
ldr r3, [lr] ; load up packed filter coefficients
|
||||
ldr r4, [lr, #4]
|
||||
ldr r5, [lr, #8]
|
||||
|
||||
pkhbt r12, r4, r3 ; pack the filter differently
|
||||
pkhbt r11, r5, r4
|
||||
|
||||
second_pass_hloop_v6
|
||||
ldr r6, [sp] ; load the data
|
||||
ldr r7, [sp, #4]
|
||||
|
||||
orr r2, r2, #2 ; loop counter
|
||||
|
||||
second_pass_wloop_v6
|
||||
smuad lr, r3, r6 ; apply filter
|
||||
smulbt r10, r3, r6
|
||||
|
||||
ldr r8, [sp, #8]
|
||||
|
||||
smlad lr, r4, r7, lr
|
||||
smladx r10, r12, r7, r10
|
||||
|
||||
ldrh r9, [sp, #12]
|
||||
|
||||
smlad lr, r5, r8, lr
|
||||
smladx r10, r11, r8, r10
|
||||
|
||||
add sp, sp, #4
|
||||
smlatb r10, r5, r9, r10
|
||||
|
||||
sub r2, r2, #1
|
||||
|
||||
add lr, lr, #0x40 ; round_shift_and_clamp
|
||||
tst r2, #0xff
|
||||
usat lr, #8, lr, asr #7
|
||||
add r10, r10, #0x40
|
||||
strb lr, [r0], r1 ; the result is transposed back and stored
|
||||
usat r10, #8, r10, asr #7
|
||||
|
||||
strb r10, [r0],r1
|
||||
|
||||
movne r6, r7
|
||||
movne r7, r8
|
||||
|
||||
bne second_pass_wloop_v6
|
||||
|
||||
subs r2, r2, #0x10000
|
||||
add sp, sp, #12 ; update src for next loop (20-8)
|
||||
sub r0, r0, r1, lsl #2 ; back up the 4 destination rows just written
|
||||
add r0, r0, #1 ; next destination column
|
||||
|
||||
bne second_pass_hloop_v6
|
||||
|
||||
add sp, sp, #20
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
;--------------------
|
||||
skip_firstpass_filter
|
||||
sub r0, r0, r1, lsl #1
|
||||
sub r1, r1, #8
|
||||
mov r2, #9
|
||||
|
||||
skip_firstpass_hloop
|
||||
ldrb r4, [r0], #1 ; load data
|
||||
subs r2, r2, #1
|
||||
ldrb r5, [r0], #1
|
||||
strh r4, [lr], #20 ; store it to intermediate buffer
|
||||
ldrb r6, [r0], #1 ; load data
|
||||
strh r5, [lr], #20
|
||||
ldrb r7, [r0], #1
|
||||
strh r6, [lr], #20
|
||||
ldrb r8, [r0], #1
|
||||
strh r7, [lr], #20
|
||||
ldrb r9, [r0], #1
|
||||
strh r8, [lr], #20
|
||||
ldrb r10, [r0], #1
|
||||
strh r9, [lr], #20
|
||||
ldrb r11, [r0], #1
|
||||
strh r10, [lr], #20
|
||||
add r0, r0, r1 ; move to next input line
|
||||
strh r11, [lr], #20
|
||||
|
||||
sub lr, lr, #158 ; move over to next column
|
||||
bne skip_firstpass_hloop
|
||||
|
||||
b secondpass_filter
|
||||
|
||||
;--------------------
|
||||
skip_secondpass_filter
|
||||
mov r2, #8
|
||||
add sp, sp, #4 ;start from src[0] instead of src[-2]
|
||||
|
||||
skip_secondpass_hloop
|
||||
ldr r6, [sp], #4
|
||||
subs r2, r2, #1
|
||||
ldr r8, [sp], #4
|
||||
|
||||
mov r7, r6, lsr #16 ; unpack
|
||||
strb r6, [r0], r1
|
||||
mov r9, r8, lsr #16
|
||||
strb r7, [r0], r1
|
||||
add sp, sp, #12 ; 20-8
|
||||
strb r8, [r0], r1
|
||||
strb r9, [r0], r1
|
||||
|
||||
sub r0, r0, r1, lsl #2
|
||||
add r0, r0, #1
|
||||
|
||||
bne skip_secondpass_hloop
|
||||
|
||||
add sp, sp, #16 ; 180 - (160 +4)
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
;-----------------
|
||||
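;Each table row holds one filter: three words of packed 16-bit coefficient pairs followed by one zero padding word (16 bytes per row). Label filter8_coeff is used to access the data.
|
||||
;Row address for filter index n: filter8_coeff + n*16; its coefficient words are at offsets +0, +4 and +8.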
|
||||
filter8_coeff
|
||||
DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000
|
||||
DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000
|
||||
DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000
|
||||
DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000
|
||||
DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000
|
||||
DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000
|
||||
DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000
|
||||
DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000
|
||||
|
||||
;DCD 0, 0, 128, 0, 0, 0
|
||||
;DCD 0, -6, 123, 12, -1, 0
|
||||
;DCD 2, -11, 108, 36, -8, 1
|
||||
;DCD 0, -9, 93, 50, -6, 0
|
||||
;DCD 3, -16, 77, 77, -16, 3
|
||||
;DCD 0, -6, 50, 93, -9, 0
|
||||
;DCD 1, -8, 36, 108, -11, 2
|
||||
;DCD 0, -1, 12, 123, -6, 0
|
||||
|
||||
END
|
@ -1,87 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8_rtcd.h"
|
||||
#include <math.h>
|
||||
#include "vp8/common/filter.h"
|
||||
#include "bilinearfilter_arm.h"
|
||||
|
||||
/* Two-pass bilinear filter for a Width x Height block.
 *
 * The first pass filters src_ptr horizontally with HFilter into a 16-bit
 * scratch buffer, producing Height + 1 rows. The second pass filters that
 * buffer vertically with VFilter and writes Height rows to dst_ptr.
 */
void vp8_filter_block2d_bil_armv6(unsigned char *src_ptr,
                                  unsigned char *dst_ptr,
                                  unsigned int src_pitch,
                                  unsigned int dst_pitch, const short *HFilter,
                                  const short *VFilter, int Width, int Height) {
  /* Scratch storage for the horizontally filtered rows. */
  unsigned short scratch[36 * 16];

  /* Horizontal pass: one row more than the output height. */
  vp8_filter_block2d_bil_first_pass_armv6(src_ptr, scratch, src_pitch,
                                          Height + 1, Width, HFilter);

  /* Vertical pass: consumes the scratch rows and emits the final pixels. */
  vp8_filter_block2d_bil_second_pass_armv6(scratch, dst_ptr, dst_pitch, Height,
                                           Width, VFilter);
}
|
||||
|
||||
void vp8_bilinear_predict4x4_armv6(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
|
||||
HFilter, VFilter, 4, 4);
|
||||
}
|
||||
|
||||
void vp8_bilinear_predict8x8_armv6(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
|
||||
HFilter, VFilter, 8, 8);
|
||||
}
|
||||
|
||||
void vp8_bilinear_predict8x4_armv6(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
|
||||
HFilter, VFilter, 8, 4);
|
||||
}
|
||||
|
||||
void vp8_bilinear_predict16x16_armv6(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
|
||||
HFilter = vp8_bilinear_filters[xoffset];
|
||||
VFilter = vp8_bilinear_filters[yoffset];
|
||||
|
||||
vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
|
||||
HFilter, VFilter, 16, 16);
|
||||
}
|
@ -1,31 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
|
||||
#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
extern void vp8_filter_block2d_bil_first_pass_armv6(
|
||||
const unsigned char *src_ptr, unsigned short *dst_ptr,
|
||||
unsigned int src_pitch, unsigned int height, unsigned int width,
|
||||
const short *vp8_filter);
|
||||
|
||||
extern void vp8_filter_block2d_bil_second_pass_armv6(
|
||||
const unsigned short *src_ptr, unsigned char *dst_ptr, int dst_pitch,
|
||||
unsigned int height, unsigned int width, const short *vp8_filter);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_
|
@ -1,23 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8/common/blockd.h"
|
||||
|
||||
#if HAVE_MEDIA
|
||||
extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ);
|
||||
|
||||
/* Dequantizes one block: forwards the block's quantized coefficients
 * (d->qcoeff), the dequantization table DQC and the output buffer
 * (d->dqcoeff) to the assembly loop. */
void vp8_dequantize_b_v6(BLOCKD *d, short *DQC) {
  vp8_dequantize_b_loop_v6(d->qcoeff, DQC, d->dqcoeff);
}
|
||||
#endif
|
@ -1,176 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8_rtcd.h"
|
||||
#include <math.h>
|
||||
#include "vp8/common/filter.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
extern void vp8_filter_block2d_first_pass_armv6(
|
||||
unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
|
||||
unsigned int output_width, unsigned int output_height,
|
||||
const short *vp8_filter);
|
||||
|
||||
// 8x8
|
||||
extern void vp8_filter_block2d_first_pass_8x8_armv6(
|
||||
unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
|
||||
unsigned int output_width, unsigned int output_height,
|
||||
const short *vp8_filter);
|
||||
|
||||
// 16x16
|
||||
extern void vp8_filter_block2d_first_pass_16x16_armv6(
|
||||
unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line,
|
||||
unsigned int output_width, unsigned int output_height,
|
||||
const short *vp8_filter);
|
||||
|
||||
extern void vp8_filter_block2d_second_pass_armv6(short *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int output_pitch,
|
||||
unsigned int cnt,
|
||||
const short *vp8_filter);
|
||||
|
||||
extern void vp8_filter4_block2d_second_pass_armv6(short *src_ptr,
|
||||
unsigned char *output_ptr,
|
||||
unsigned int output_pitch,
|
||||
unsigned int cnt,
|
||||
const short *vp8_filter);
|
||||
|
||||
extern void vp8_filter_block2d_first_pass_only_armv6(
|
||||
unsigned char *src_ptr, unsigned char *output_ptr,
|
||||
unsigned int src_pixels_per_line, unsigned int cnt,
|
||||
unsigned int output_pitch, const short *vp8_filter);
|
||||
|
||||
extern void vp8_filter_block2d_second_pass_only_armv6(
|
||||
unsigned char *src_ptr, unsigned char *output_ptr,
|
||||
unsigned int src_pixels_per_line, unsigned int cnt,
|
||||
unsigned int output_pitch, const short *vp8_filter);
|
||||
|
||||
#if HAVE_MEDIA
|
||||
void vp8_sixtap_predict4x4_armv6(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
DECLARE_ALIGNED(4, short,
|
||||
FData[12 * 4]); /* Temp data buffer used in filtering */
|
||||
|
||||
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
|
||||
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
|
||||
|
||||
/* Vfilter is null. First pass only */
|
||||
if (xoffset && !yoffset) {
|
||||
/*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2,
|
||||
src_pixels_per_line, 4, 4, HFilter );
|
||||
vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4,
|
||||
VFilter );*/
|
||||
|
||||
vp8_filter_block2d_first_pass_only_armv6(
|
||||
src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter);
|
||||
}
|
||||
/* Hfilter is null. Second pass only */
|
||||
else if (!xoffset && yoffset) {
|
||||
vp8_filter_block2d_second_pass_only_armv6(
|
||||
src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter);
|
||||
} else {
|
||||
/* Vfilter is a 4 tap filter */
|
||||
if (yoffset & 0x1) {
|
||||
vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line,
|
||||
FData + 1, src_pixels_per_line, 4, 7,
|
||||
HFilter);
|
||||
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4,
|
||||
VFilter);
|
||||
}
|
||||
/* Vfilter is 6 tap filter */
|
||||
else {
|
||||
vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line),
|
||||
FData, src_pixels_per_line, 4, 9,
|
||||
HFilter);
|
||||
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4,
|
||||
VFilter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_sixtap_predict8x8_armv6(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
DECLARE_ALIGNED(4, short,
|
||||
FData[16 * 8]); /* Temp data buffer used in filtering */
|
||||
|
||||
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
|
||||
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
|
||||
|
||||
if (xoffset && !yoffset) {
|
||||
vp8_filter_block2d_first_pass_only_armv6(
|
||||
src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter);
|
||||
}
|
||||
/* Hfilter is null. Second pass only */
|
||||
else if (!xoffset && yoffset) {
|
||||
vp8_filter_block2d_second_pass_only_armv6(
|
||||
src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter);
|
||||
} else {
|
||||
if (yoffset & 0x1) {
|
||||
vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line,
|
||||
FData + 1, src_pixels_per_line, 8,
|
||||
11, HFilter);
|
||||
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8,
|
||||
VFilter);
|
||||
} else {
|
||||
vp8_filter_block2d_first_pass_8x8_armv6(
|
||||
src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8,
|
||||
13, HFilter);
|
||||
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8,
|
||||
VFilter);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void vp8_sixtap_predict16x16_armv6(unsigned char *src_ptr,
|
||||
int src_pixels_per_line, int xoffset,
|
||||
int yoffset, unsigned char *dst_ptr,
|
||||
int dst_pitch) {
|
||||
const short *HFilter;
|
||||
const short *VFilter;
|
||||
DECLARE_ALIGNED(4, short,
|
||||
FData[24 * 16]); /* Temp data buffer used in filtering */
|
||||
|
||||
HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */
|
||||
VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */
|
||||
|
||||
if (xoffset && !yoffset) {
|
||||
vp8_filter_block2d_first_pass_only_armv6(
|
||||
src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter);
|
||||
}
|
||||
/* Hfilter is null. Second pass only */
|
||||
else if (!xoffset && yoffset) {
|
||||
vp8_filter_block2d_second_pass_only_armv6(
|
||||
src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter);
|
||||
} else {
|
||||
if (yoffset & 0x1) {
|
||||
vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line,
|
||||
FData + 1, src_pixels_per_line,
|
||||
16, 19, HFilter);
|
||||
vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16,
|
||||
VFilter);
|
||||
} else {
|
||||
vp8_filter_block2d_first_pass_16x16_armv6(
|
||||
src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16,
|
||||
21, HFilter);
|
||||
vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16,
|
||||
VFilter);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
@ -13,18 +13,6 @@
|
||||
#include "vp8/common/loopfilter.h"
|
||||
#include "vp8/common/onyxc_int.h"
|
||||
|
||||
#define prototype_loopfilter(sym) \
|
||||
void sym(unsigned char *src, int pitch, const unsigned char *blimit, \
|
||||
const unsigned char *limit, const unsigned char *thresh, int count)
|
||||
|
||||
#if HAVE_MEDIA
|
||||
extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6);
|
||||
extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6);
|
||||
extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6);
|
||||
extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6);
|
||||
#endif
|
||||
|
||||
#if HAVE_NEON
|
||||
typedef void loopfilter_y_neon(unsigned char *src, int pitch,
|
||||
unsigned char blimit, unsigned char limit,
|
||||
unsigned char thresh);
|
||||
@ -41,101 +29,7 @@ extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon;
|
||||
extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon;
|
||||
extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon;
|
||||
extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon;
|
||||
#endif
|
||||
|
||||
#if HAVE_MEDIA
|
||||
/* ARMV6/MEDIA loopfilter functions*/
|
||||
/* Horizontal MB filtering */
|
||||
/* Filters the horizontal macroblock edge of the luma plane and of whichever
 * chroma planes are supplied (a null u_ptr / v_ptr skips that plane).
 * Luma is called with count 2, each chroma plane with count 1. */
void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr,
                               unsigned char *v_ptr, int y_stride,
                               int uv_stride, loop_filter_info *lfi) {
  unsigned char *chroma[2];
  int plane;

  chroma[0] = u_ptr;
  chroma[1] = v_ptr;

  vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim,
                                          lfi->hev_thr, 2);

  for (plane = 0; plane < 2; ++plane) {
    if (chroma[plane]) {
      vp8_mbloop_filter_horizontal_edge_armv6(chroma[plane], uv_stride,
                                              lfi->mblim, lfi->lim,
                                              lfi->hev_thr, 1);
    }
  }
}
|
||||
|
||||
/* Vertical MB Filtering */
|
||||
/* Filters the vertical macroblock edge of the luma plane and of whichever
 * chroma planes are supplied (a null u_ptr / v_ptr skips that plane).
 * Luma is called with count 2, each chroma plane with count 1. */
void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr,
                               unsigned char *v_ptr, int y_stride,
                               int uv_stride, loop_filter_info *lfi) {
  unsigned char *chroma[2];
  int plane;

  chroma[0] = u_ptr;
  chroma[1] = v_ptr;

  vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 2);

  for (plane = 0; plane < 2; ++plane) {
    if (chroma[plane]) {
      vp8_mbloop_filter_vertical_edge_armv6(chroma[plane], uv_stride,
                                            lfi->mblim, lfi->lim, lfi->hev_thr,
                                            1);
    }
  }
}
|
||||
|
||||
/* Horizontal B Filtering */
|
||||
void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
unsigned char *v_ptr, int y_stride, int uv_stride,
|
||||
loop_filter_info *lfi) {
|
||||
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride,
|
||||
lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride,
|
||||
lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride,
|
||||
lfi->blim, lfi->lim, lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr)
|
||||
vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride,
|
||||
lfi->blim, lfi->lim, lfi->hev_thr, 1);
|
||||
|
||||
if (v_ptr)
|
||||
vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride,
|
||||
lfi->blim, lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
/* Applies the ARMv6 simple horizontal-edge filter to the three inner luma
 * edges (rows 4, 8 and 12) of a macroblock. */
void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride,
                               const unsigned char *blimit) {
  int row;

  for (row = 4; row <= 12; row += 4) {
    vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + row * y_stride,
                                                 y_stride, blimit);
  }
}
|
||||
|
||||
/* Vertical B Filtering */
|
||||
void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
unsigned char *v_ptr, int y_stride, int uv_stride,
|
||||
loop_filter_info *lfi) {
|
||||
vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
|
||||
lfi->hev_thr, 2);
|
||||
vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
|
||||
lfi->hev_thr, 2);
|
||||
vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
|
||||
lfi->hev_thr, 2);
|
||||
|
||||
if (u_ptr)
|
||||
vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim,
|
||||
lfi->lim, lfi->hev_thr, 1);
|
||||
|
||||
if (v_ptr)
|
||||
vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim,
|
||||
lfi->lim, lfi->hev_thr, 1);
|
||||
}
|
||||
|
||||
/* Applies the ARMv6 simple vertical-edge filter to the three inner luma
 * edges (columns 4, 8 and 12) of a macroblock. */
void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride,
                               const unsigned char *blimit) {
  int col;

  for (col = 4; col <= 12; col += 4) {
    vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + col, y_stride, blimit);
  }
}
|
||||
#endif
|
||||
|
||||
#if HAVE_NEON
|
||||
/* NEON loopfilter functions */
|
||||
/* Horizontal MB filtering */
|
||||
void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
@ -205,4 +99,3 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr,
|
||||
vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim,
|
||||
hev_thr, v_ptr + 4);
|
||||
}
|
||||
#endif
|
||||
|
@ -29,81 +29,69 @@ $vp8_clear_system_state_mmx=vpx_reset_mmx_state;
|
||||
# Dequant
|
||||
#
|
||||
add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
|
||||
specialize qw/vp8_dequantize_b mmx media neon msa/;
|
||||
$vp8_dequantize_b_media=vp8_dequantize_b_v6;
|
||||
specialize qw/vp8_dequantize_b mmx neon msa/;
|
||||
|
||||
add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
|
||||
specialize qw/vp8_dequant_idct_add mmx media neon dspr2 msa/;
|
||||
$vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6;
|
||||
specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/;
|
||||
$vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
|
||||
|
||||
add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
|
||||
specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
|
||||
specialize qw/vp8_dequant_idct_add_y_block mmx sse2 neon dspr2 msa/;
|
||||
$vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
|
||||
|
||||
add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
|
||||
specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
|
||||
specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 neon dspr2 msa/;
|
||||
# Map the dspr2 specialization of the uv-block variant to its implementation.
$vp8_dequant_idct_add_uv_block_dspr2=vp8_dequant_idct_add_uv_block_dspr2;
|
||||
|
||||
#
|
||||
# Loopfilter
|
||||
#
|
||||
add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
|
||||
specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6;
|
||||
specialize qw/vp8_loop_filter_mbv mmx sse2 neon dspr2 msa/;
|
||||
$vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
|
||||
specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6;
|
||||
specialize qw/vp8_loop_filter_bv mmx sse2 neon dspr2 msa/;
|
||||
$vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
|
||||
specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6;
|
||||
specialize qw/vp8_loop_filter_mbh mmx sse2 neon dspr2 msa/;
|
||||
$vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi";
|
||||
specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6;
|
||||
specialize qw/vp8_loop_filter_bh mmx sse2 neon dspr2 msa/;
|
||||
$vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2;
|
||||
|
||||
|
||||
add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit";
|
||||
specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon msa/;
|
||||
specialize qw/vp8_loop_filter_simple_mbv mmx sse2 neon msa/;
|
||||
$vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c;
|
||||
$vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx;
|
||||
$vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2;
|
||||
$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6;
|
||||
$vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon;
|
||||
$vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit";
|
||||
specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon msa/;
|
||||
specialize qw/vp8_loop_filter_simple_mbh mmx sse2 neon msa/;
|
||||
$vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c;
|
||||
$vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx;
|
||||
$vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2;
|
||||
$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6;
|
||||
$vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon;
|
||||
$vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit";
|
||||
specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon msa/;
|
||||
specialize qw/vp8_loop_filter_simple_bv mmx sse2 neon msa/;
|
||||
$vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c;
|
||||
$vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx;
|
||||
$vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2;
|
||||
$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6;
|
||||
$vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon;
|
||||
$vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa;
|
||||
|
||||
add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit";
|
||||
specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon msa/;
|
||||
specialize qw/vp8_loop_filter_simple_bh mmx sse2 neon msa/;
|
||||
$vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c;
|
||||
$vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx;
|
||||
$vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2;
|
||||
$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6;
|
||||
$vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon;
|
||||
$vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
|
||||
|
||||
@ -112,8 +100,7 @@ $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa;
|
||||
#
|
||||
#idct16
|
||||
add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
|
||||
specialize qw/vp8_short_idct4x4llm mmx media neon dspr2 msa/;
|
||||
$vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual;
|
||||
specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/;
|
||||
$vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2;
|
||||
|
||||
#iwalsh1
|
||||
@ -124,32 +111,27 @@ $vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2;
|
||||
|
||||
#iwalsh16
|
||||
add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
|
||||
specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6;
|
||||
specialize qw/vp8_short_inv_walsh4x4 mmx sse2 neon dspr2 msa/;
|
||||
$vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2;
|
||||
|
||||
#idct1_scalar_add
|
||||
add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
|
||||
specialize qw/vp8_dc_only_idct_add mmx media neon dspr2 msa/;
|
||||
$vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6;
|
||||
specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/;
|
||||
$vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
|
||||
|
||||
#
|
||||
# RECON
|
||||
#
|
||||
add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2 msa/;
|
||||
$vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6;
|
||||
specialize qw/vp8_copy_mem16x16 mmx sse2 neon dspr2 msa/;
|
||||
$vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2;
|
||||
|
||||
add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_copy_mem8x8 mmx media neon dspr2 msa/;
|
||||
$vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6;
|
||||
specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/;
|
||||
$vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2;
|
||||
|
||||
add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/;
|
||||
$vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6;
|
||||
specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/;
|
||||
$vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2;
|
||||
|
||||
#
|
||||
@ -180,40 +162,36 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") {
|
||||
# Subpixel
|
||||
#
|
||||
add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2 msa/;
|
||||
$vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6;
|
||||
specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 neon dspr2 msa/;
|
||||
$vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2;
|
||||
|
||||
add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2 msa/;
|
||||
$vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6;
|
||||
specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 neon dspr2 msa/;
|
||||
$vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2;
|
||||
|
||||
add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2 msa/;
|
||||
$vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6;
|
||||
specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/;
|
||||
$vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2;
|
||||
|
||||
# TODO(johannkoenig): Add neon implementation
|
||||
# https://bugs.chromium.org/p/webm/issues/detail?id=1273
|
||||
add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/;
|
||||
$vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6;
|
||||
specialize qw/vp8_sixtap_predict4x4 mmx ssse3 dspr2 msa/;
|
||||
$vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2;
|
||||
|
||||
add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon msa/;
|
||||
$vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6;
|
||||
specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 neon msa/;
|
||||
|
||||
add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon msa/;
|
||||
$vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6;
|
||||
specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 neon msa/;
|
||||
|
||||
add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/;
|
||||
$vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6;
|
||||
specialize qw/vp8_bilinear_predict8x4 mmx neon msa/;
|
||||
|
||||
# TODO(johannkoenig): Add neon implementation
|
||||
# https://bugs.chromium.org/p/webm/issues/detail?id=1273
|
||||
add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
|
||||
specialize qw/vp8_bilinear_predict4x4 mmx media msa/;
|
||||
$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
|
||||
specialize qw/vp8_bilinear_predict4x4 mmx msa/;
|
||||
|
||||
#
|
||||
# Encoder functions below this point.
|
||||
@ -232,16 +210,13 @@ if ($opts{arch} =~ /x86/) {
|
||||
# Forward DCT
|
||||
#
|
||||
add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
|
||||
specialize qw/vp8_short_fdct4x4 mmx sse2 media neon msa/;
|
||||
$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;
|
||||
specialize qw/vp8_short_fdct4x4 mmx sse2 neon msa/;
|
||||
|
||||
add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
|
||||
specialize qw/vp8_short_fdct8x4 mmx sse2 media neon msa/;
|
||||
$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;
|
||||
specialize qw/vp8_short_fdct8x4 mmx sse2 neon msa/;
|
||||
|
||||
add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
|
||||
specialize qw/vp8_short_walsh4x4 sse2 media neon msa/;
|
||||
$vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;
|
||||
specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
|
||||
|
||||
#
|
||||
# Quantizer
|
||||
|
@ -1,262 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_short_fdct4x4_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY
|
||||
; void vp8_short_fdct4x4_armv6(short *input, short *output, int pitch)
|
||||
|vp8_short_fdct4x4_armv6| PROC
|
||||
|
||||
stmfd sp!, {r4 - r12, lr}
|
||||
|
||||
; PART 1
|
||||
|
||||
; coeffs 0-3
|
||||
ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2]
|
||||
|
||||
ldr r10, c7500
|
||||
ldr r11, c14500
|
||||
ldr r12, c0x22a453a0 ; [2217*4 | 5352*4]
|
||||
ldr lr, c0x00080008
|
||||
ror r5, r5, #16 ; [i2 | i3]
|
||||
|
||||
qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift
|
||||
qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift
|
||||
|
||||
add r0, r0, r2 ; update input pointer
|
||||
|
||||
qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd
|
||||
; with 2217*4 and 5352*4 without losing the
|
||||
; sign bit (overflow)
|
||||
|
||||
smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8
|
||||
smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8
|
||||
|
||||
smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||
smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||
|
||||
ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6]
|
||||
|
||||
pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2
|
||||
pkhbt r6, r5, r7, lsl #4 ; [o3 | o2]
|
||||
|
||||
str r6, [r1, #4]
|
||||
|
||||
; coeffs 4-7
|
||||
ror r9, r9, #16 ; [i6 | i7]
|
||||
|
||||
qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift
|
||||
qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift
|
||||
|
||||
add r0, r0, r2 ; update input pointer
|
||||
|
||||
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||
; with 2217*4 and 5352*4 without losing the
|
||||
; sign bit (overflow)
|
||||
|
||||
smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8
|
||||
smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8
|
||||
|
||||
smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||
smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||
|
||||
ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10]
|
||||
|
||||
pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2
|
||||
pkhbt r6, r8, r7, lsl #4 ; [o7 | o6]
|
||||
|
||||
str r6, [r1, #12]
|
||||
|
||||
; coeffs 8-11
|
||||
ror r5, r5, #16 ; [i10 | i11]
|
||||
|
||||
qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift
|
||||
qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift
|
||||
|
||||
add r0, r0, r2 ; update input pointer
|
||||
|
||||
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||
; with 2217*4 and 5352*4 without losing the
|
||||
; sign bit (overflow)
|
||||
|
||||
smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8
|
||||
smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8
|
||||
|
||||
smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||
smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||
|
||||
ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14]
|
||||
|
||||
pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2
|
||||
pkhbt r6, r8, r7, lsl #4 ; [o11 | o10]
|
||||
|
||||
str r6, [r1, #20]
|
||||
|
||||
; coeffs 12-15
|
||||
ror r5, r5, #16 ; [i14 | i15]
|
||||
|
||||
qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift
|
||||
qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift
|
||||
|
||||
qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd
|
||||
; with 2217*4 and 5352*4 without losing the
|
||||
; sign bit (overflow)
|
||||
|
||||
smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8
|
||||
smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8
|
||||
|
||||
smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500)
|
||||
smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500)
|
||||
|
||||
pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2
|
||||
pkhbt r6, r5, r7, lsl #4 ; [o15 | o14]
|
||||
|
||||
str r6, [r1, #28]
|
||||
|
||||
|
||||
; PART 2 -------------------------------------------------
|
||||
ldr r11, c12000
|
||||
ldr r10, c51000
|
||||
ldr lr, c0x00070007
|
||||
|
||||
qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12]
|
||||
qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8]
|
||||
qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8]
|
||||
qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12]
|
||||
|
||||
qadd16 r4, r4, lr ; a1 + 7
|
||||
|
||||
add r0, r11, #0x10000 ; add (d!=0)
|
||||
|
||||
qadd16 r2, r4, r5 ; a1 + b1 + 7
|
||||
qsub16 r3, r4, r5 ; a1 - b1 + 7
|
||||
|
||||
ldr r12, c0x08a914e8 ; [2217 | 5352]
|
||||
|
||||
lsl r8, r2, #16 ; prepare bottom halfword for scaling
|
||||
asr r2, r2, #4 ; scale top halfword
|
||||
lsl r9, r3, #16 ; prepare bottom halfword for scaling
|
||||
asr r3, r3, #4 ; scale top halfword
|
||||
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
|
||||
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
|
||||
|
||||
smulbt r2, r6, r12 ; [ ------ | c1*2217]
|
||||
str r4, [r1, #0] ; [ o1 | o0]
|
||||
smultt r3, r6, r12 ; [c1*2217 | ------ ]
|
||||
str r5, [r1, #16] ; [ o9 | o8]
|
||||
|
||||
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
|
||||
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
|
||||
|
||||
smulbb r2, r6, r12 ; [ ------ | c1*5352]
|
||||
smultb r3, r6, r12 ; [c1*5352 | ------ ]
|
||||
|
||||
lsls r6, r7, #16 ; d1 != 0 ?
|
||||
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
|
||||
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
|
||||
asrs r6, r7, #16
|
||||
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
|
||||
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
|
||||
|
||||
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
|
||||
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
|
||||
|
||||
pkhtb r9, r9, r8, asr #16
|
||||
|
||||
sub r4, r4, r2
|
||||
sub r5, r5, r3
|
||||
|
||||
ldr r3, [r1, #4] ; [i3 | i2]
|
||||
|
||||
pkhtb r5, r5, r4, asr #16 ; [o13|o12]
|
||||
|
||||
str r9, [r1, #8] ; [o5 | 04]
|
||||
|
||||
ldr r9, [r1, #12] ; [i7 | i6]
|
||||
ldr r8, [r1, #28] ; [i15|i14]
|
||||
ldr r2, [r1, #20] ; [i11|i10]
|
||||
str r5, [r1, #24] ; [o13|o12]
|
||||
|
||||
qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14]
|
||||
qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10]
|
||||
|
||||
qadd16 r4, r4, lr ; a1 + 7
|
||||
|
||||
qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10]
|
||||
qadd16 r2, r4, r5 ; a1 + b1 + 7
|
||||
qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14]
|
||||
qsub16 r3, r4, r5 ; a1 - b1 + 7
|
||||
|
||||
lsl r8, r2, #16 ; prepare bottom halfword for scaling
|
||||
asr r2, r2, #4 ; scale top halfword
|
||||
lsl r9, r3, #16 ; prepare bottom halfword for scaling
|
||||
asr r3, r3, #4 ; scale top halfword
|
||||
pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword
|
||||
pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword
|
||||
|
||||
smulbt r2, r6, r12 ; [ ------ | c1*2217]
|
||||
str r4, [r1, #4] ; [ o3 | o2]
|
||||
smultt r3, r6, r12 ; [c1*2217 | ------ ]
|
||||
str r5, [r1, #20] ; [ o11 | o10]
|
||||
|
||||
smlabb r8, r7, r12, r2 ; [ ------ | d1*5352]
|
||||
smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ]
|
||||
|
||||
smulbb r2, r6, r12 ; [ ------ | c1*5352]
|
||||
smultb r3, r6, r12 ; [c1*5352 | ------ ]
|
||||
|
||||
lsls r6, r7, #16 ; d1 != 0 ?
|
||||
addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0)
|
||||
addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0)
|
||||
|
||||
asrs r6, r7, #16
|
||||
addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0)
|
||||
addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0)
|
||||
|
||||
smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000
|
||||
smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000
|
||||
|
||||
pkhtb r9, r9, r8, asr #16
|
||||
|
||||
sub r4, r4, r2
|
||||
sub r5, r5, r3
|
||||
|
||||
str r9, [r1, #12] ; [o7 | o6]
|
||||
pkhtb r5, r5, r4, asr #16 ; [o15|o14]
|
||||
|
||||
str r5, [r1, #28] ; [o15|o14]
|
||||
|
||||
ldmfd sp!, {r4 - r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; Used constants
|
||||
c7500
|
||||
DCD 7500
|
||||
c14500
|
||||
DCD 14500
|
||||
c0x22a453a0
|
||||
DCD 0x22a453a0
|
||||
c0x00080008
|
||||
DCD 0x00080008
|
||||
c12000
|
||||
DCD 12000
|
||||
c51000
|
||||
DCD 51000
|
||||
c0x00070007
|
||||
DCD 0x00070007
|
||||
c0x08a914e8
|
||||
DCD 0x08a914e8
|
||||
|
||||
END
|
@ -1,212 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
EXPORT |vp8_short_walsh4x4_armv6|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
|
||||
; r0 short *input,
|
||||
; r1 short *output,
|
||||
; r2 int pitch
|
||||
|vp8_short_walsh4x4_armv6| PROC
|
||||
|
||||
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldrd r4, r5, [r0], r2
|
||||
ldr lr, c00040004
|
||||
ldrd r6, r7, [r0], r2
|
||||
|
||||
; 0-3
|
||||
qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2]
|
||||
qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2]
|
||||
|
||||
ldrd r8, r9, [r0], r2
|
||||
; 4-7
|
||||
qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6]
|
||||
qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6]
|
||||
|
||||
ldrd r10, r11, [r0]
|
||||
; 8-11
|
||||
qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10]
|
||||
qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10]
|
||||
|
||||
; 12-15
|
||||
qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14]
|
||||
qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14]
|
||||
|
||||
|
||||
lsls r2, r3, #16
|
||||
smuad r11, r3, lr ; A0 = a1<<2 + d1<<2
|
||||
addne r11, r11, #1 ; A0 += (a1!=0)
|
||||
|
||||
lsls r2, r7, #16
|
||||
smuad r12, r7, lr ; C0 = a1<<2 + d1<<2
|
||||
addne r12, r12, #1 ; C0 += (a1!=0)
|
||||
|
||||
add r0, r11, r12 ; a1_0 = A0 + C0
|
||||
sub r11, r11, r12 ; b1_0 = A0 - C0
|
||||
|
||||
lsls r2, r5, #16
|
||||
smuad r12, r5, lr ; B0 = a1<<2 + d1<<2
|
||||
addne r12, r12, #1 ; B0 += (a1!=0)
|
||||
|
||||
lsls r2, r9, #16
|
||||
smuad r2, r9, lr ; D0 = a1<<2 + d1<<2
|
||||
addne r2, r2, #1 ; D0 += (a1!=0)
|
||||
|
||||
add lr, r12, r2 ; d1_0 = B0 + D0
|
||||
sub r12, r12, r2 ; c1_0 = B0 - D0
|
||||
|
||||
; op[0,4,8,12]
|
||||
adds r2, r0, lr ; a2 = a1_0 + d1_0
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r0, r0, lr ; d2 = a1_0 - d1_0
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1] ; op[0]
|
||||
|
||||
addmi r0, r0, #1 ; += a2 < 0
|
||||
add r0, r0, #3 ; += 3
|
||||
ldr lr, c00040004
|
||||
mov r0, r0, asr #3 ; >> 3
|
||||
strh r0, [r1, #24] ; op[12]
|
||||
|
||||
adds r2, r11, r12 ; b2 = b1_0 + c1_0
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r0, r11, r12 ; c2 = b1_0 - c1_0
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #8] ; op[4]
|
||||
|
||||
addmi r0, r0, #1 ; += a2 < 0
|
||||
add r0, r0, #3 ; += 3
|
||||
smusd r3, r3, lr ; A3 = a1<<2 - d1<<2
|
||||
smusd r7, r7, lr ; C3 = a1<<2 - d1<<2
|
||||
mov r0, r0, asr #3 ; >> 3
|
||||
strh r0, [r1, #16] ; op[8]
|
||||
|
||||
|
||||
; op[3,7,11,15]
|
||||
add r0, r3, r7 ; a1_3 = A3 + C3
|
||||
sub r3, r3, r7 ; b1_3 = A3 - C3
|
||||
|
||||
smusd r5, r5, lr ; B3 = a1<<2 - d1<<2
|
||||
smusd r9, r9, lr ; D3 = a1<<2 - d1<<2
|
||||
add r7, r5, r9 ; d1_3 = B3 + D3
|
||||
sub r5, r5, r9 ; c1_3 = B3 - D3
|
||||
|
||||
adds r2, r0, r7 ; a2 = a1_3 + d1_3
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
adds r9, r3, r5 ; b2 = b1_3 + c1_3
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #6] ; op[3]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
subs r2, r3, r5 ; c2 = b1_3 - c1_3
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #14] ; op[7]
|
||||
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r9, r0, r7 ; d2 = a1_3 - d1_3
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #22] ; op[11]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
smuad r3, r4, lr ; A1 = b1<<2 + c1<<2
|
||||
smuad r5, r8, lr ; C1 = b1<<2 + c1<<2
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #30] ; op[15]
|
||||
|
||||
; op[1,5,9,13]
|
||||
add r0, r3, r5 ; a1_1 = A1 + C1
|
||||
sub r3, r3, r5 ; b1_1 = A1 - C1
|
||||
|
||||
smuad r7, r6, lr ; B1 = b1<<2 + c1<<2
|
||||
smuad r9, r10, lr ; D1 = b1<<2 + c1<<2
|
||||
add r5, r7, r9 ; d1_1 = B1 + D1
|
||||
sub r7, r7, r9 ; c1_1 = B1 - D1
|
||||
|
||||
adds r2, r0, r5 ; a2 = a1_1 + d1_1
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
adds r9, r3, r7 ; b2 = b1_1 + c1_1
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #2] ; op[1]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
subs r2, r3, r7 ; c2 = b1_1 - c1_1
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #10] ; op[5]
|
||||
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r9, r0, r5 ; d2 = a1_1 - d1_1
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #18] ; op[9]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
smusd r4, r4, lr ; A2 = b1<<2 - c1<<2
|
||||
smusd r8, r8, lr ; C2 = b1<<2 - c1<<2
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #26] ; op[13]
|
||||
|
||||
|
||||
; op[2,6,10,14]
|
||||
add r11, r4, r8 ; a1_2 = A2 + C2
|
||||
sub r12, r4, r8 ; b1_2 = A2 - C2
|
||||
|
||||
smusd r6, r6, lr ; B2 = b1<<2 - c1<<2
|
||||
smusd r10, r10, lr ; D2 = b1<<2 - c1<<2
|
||||
add r4, r6, r10 ; d1_2 = B2 + D2
|
||||
sub r8, r6, r10 ; c1_2 = B2 - D2
|
||||
|
||||
adds r2, r11, r4 ; a2 = a1_2 + d1_2
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
adds r9, r12, r8 ; b2 = b1_2 + c1_2
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #4] ; op[2]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
subs r2, r12, r8 ; c2 = b1_2 - c1_2
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #12] ; op[6]
|
||||
|
||||
addmi r2, r2, #1 ; += a2 < 0
|
||||
add r2, r2, #3 ; += 3
|
||||
subs r9, r11, r4 ; d2 = a1_2 - d1_2
|
||||
mov r2, r2, asr #3 ; >> 3
|
||||
strh r2, [r1, #20] ; op[10]
|
||||
|
||||
addmi r9, r9, #1 ; += a2 < 0
|
||||
add r9, r9, #3 ; += 3
|
||||
mov r9, r9, asr #3 ; >> 3
|
||||
strh r9, [r1, #28] ; op[14]
|
||||
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vp8_short_walsh4x4_armv6|
|
||||
|
||||
c00040004
|
||||
DCD 0x00040004
|
||||
|
||||
END
|
@ -1,21 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "vpx_config.h"
|
||||
#include "vp8_rtcd.h"
|
||||
|
||||
#if HAVE_MEDIA
|
||||
|
||||
/* Forward DCT of an 8x4 region, computed as two side-by-side 4x4 transforms:
 * the second half reads 4 input samples further along and writes 16
 * coefficients further into the output. */
void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch) {
  int half;

  for (half = 0; half < 2; ++half) {
    vp8_short_fdct4x4_armv6(input + 4 * half, output + 16 * half, pitch);
  }
}
|
||||
|
||||
#endif /* HAVE_MEDIA */
|
@ -123,30 +123,8 @@ ifeq ($(CONFIG_POSTPROC),yes)
|
||||
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
|
||||
endif
|
||||
|
||||
# common (c)
|
||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c
|
||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c
|
||||
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c
|
||||
|
||||
# common (media)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.h
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/bilinearfilter_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x4_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x8_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem16x16_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dc_only_idct_add_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/iwalsh_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/filter_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/loopfilter_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/simpleloopfilter_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/sixtappredict8x4_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM)
|
||||
VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c
|
||||
|
||||
# common (neon intrinsics)
|
||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c
|
||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c
|
||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c
|
||||
VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c
|
||||
|
@ -16,10 +16,6 @@ VP8_CX_SRCS-no += $(VP8_COMMON_SRCS-no)
|
||||
VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes)
|
||||
VP8_CX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no)
|
||||
|
||||
ifeq ($(ARCH_ARM),yes)
|
||||
include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk
|
||||
endif
|
||||
|
||||
VP8_CX_SRCS-yes += vp8cx.mk
|
||||
|
||||
VP8_CX_SRCS-yes += vp8_cx_iface.c
|
||||
@ -101,6 +97,11 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes)
|
||||
VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
|
||||
endif
|
||||
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
|
||||
|
||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c
|
||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c
|
||||
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
|
||||
|
@ -1,28 +0,0 @@
|
||||
##
|
||||
## Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
##
|
||||
## Use of this source code is governed by a BSD-style license
|
||||
## that can be found in the LICENSE file in the root of the source
|
||||
## tree. An additional intellectual property rights grant can be found
|
||||
## in the file PATENTS. All contributing project authors may
|
||||
## be found in the AUTHORS file in the root of the source tree.
|
||||
##
|
||||
|
||||
|
||||
# Add this makefile fragment itself to the ARM source list.
VP8_CX_SRCS-$(ARCH_ARM) += vp8cx_arm.mk
|
||||
|
||||
# File list for ARM (keyed on ARCH_ARM)
|
||||
# encoder
|
||||
VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c
|
||||
|
||||
# File list for media (keyed on HAVE_MEDIA): assembly under encoder/arm/armv6
|
||||
# encoder
|
||||
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
|
||||
VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM)
|
||||
|
||||
# File list for NEON (keyed on HAVE_NEON): C sources under encoder/arm/neon
|
||||
# encoder
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
|
||||
VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
|
@ -1,237 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vpx_filter_block2d_bil_first_pass_media|
|
||||
EXPORT |vpx_filter_block2d_bil_second_pass_media|
|
||||
|
||||
AREA |.text|, CODE, READONLY ; name this block of code
|
||||
|
||||
; First pass of the two-pass bilinear filter. For each of `height` source rows
; it combines every pair of horizontally adjacent pixels with the two 16-bit
; taps found at vpx_filter, rounds (+0x40, >> 7) and writes 16-bit results to
; dst_ptr. When the packed tap word equals 128 the pixels are copied unfiltered.
;-------------------------------------
|
||||
; r0 unsigned char *src_ptr,
|
||||
; r1 unsigned short *dst_ptr,
|
||||
; r2 unsigned int src_pitch,
|
||||
; r3 unsigned int height,
|
||||
; stack unsigned int width,
|
||||
; stack const short *vpx_filter
|
||||
;-------------------------------------
|
||||
; The output is stored transposed in the output array to make second-pass filtering easy.
|
||||
|vpx_filter_block2d_bil_first_pass_media| PROC
|
||||
; nine registers are pushed (36 bytes), so the stack arguments sit at sp+36 and sp+40
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vpx_filter address
|
||||
ldr r4, [sp, #36] ; width
|
||||
|
||||
mov r12, r3 ; outer-loop counter
|
||||
|
||||
add r7, r2, r4 ; preload next row
|
||||
pld [r0, r7]
|
||||
|
||||
sub r2, r2, r4 ; src increment for height loop
|
||||
|
||||
ldr r5, [r11] ; load up filter coefficients (both 16-bit taps in one word)
|
||||
|
||||
mov r3, r3, lsl #1 ; height*2
|
||||
add r3, r3, #2 ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
|
||||
|
||||
mov r11, r1 ; save dst_ptr for each row
|
||||
|
||||
cmp r5, #128 ; if filter coef = 128, then skip the filter
|
||||
beq bil_null_1st_filter
|
||||
|
||||
|bil_height_loop_1st_v6|
|
||||
ldrb r6, [r0] ; load source data
|
||||
ldrb r7, [r0, #1]
|
||||
ldrb r8, [r0, #2]
|
||||
mov lr, r4, lsr #2 ; 4-in-parallel loop counter (width / 4)
|
||||
|
||||
|bil_width_loop_1st_v6|
|
||||
ldrb r9, [r0, #3]
|
||||
ldrb r10, [r0, #4]
|
||||
|
||||
pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0]
|
||||
pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1]
|
||||
|
||||
smuad r6, r6, r5 ; apply the filter
|
||||
pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2]
|
||||
smuad r7, r7, r5
|
||||
pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3]
|
||||
|
||||
smuad r8, r8, r5
|
||||
smuad r9, r9, r5
|
||||
|
||||
add r0, r0, #4
|
||||
subs lr, lr, #1 ; sets the flags used by the ldrne*/bne below
|
||||
|
||||
add r6, r6, #0x40 ; round_shift_and_clamp
|
||||
add r7, r7, #0x40
|
||||
usat r6, #16, r6, asr #7
|
||||
usat r7, #16, r7, asr #7
|
||||
|
||||
strh r6, [r1], r3 ; result is transposed and stored
|
||||
|
||||
add r8, r8, #0x40 ; round_shift_and_clamp
|
||||
strh r7, [r1], r3
|
||||
add r9, r9, #0x40
|
||||
usat r8, #16, r8, asr #7
|
||||
usat r9, #16, r9, asr #7
|
||||
|
||||
strh r8, [r1], r3 ; result is transposed and stored
|
||||
|
||||
ldrneb r6, [r0] ; load source data (only if another iteration follows)
|
||||
strh r9, [r1], r3
|
||||
|
||||
ldrneb r7, [r0, #1]
|
||||
ldrneb r8, [r0, #2]
|
||||
|
||||
bne bil_width_loop_1st_v6
|
||||
|
||||
add r0, r0, r2 ; move to next input row
|
||||
subs r12, r12, #1
|
||||
|
||||
add r9, r2, r4, lsl #1 ; adding back block width
|
||||
pld [r0, r9] ; preload next row
|
||||
|
||||
add r11, r11, #2 ; move over to next column
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_1st_v6
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
; Pass-through path: widen each source byte to 16 bits without filtering.
|bil_null_1st_filter|
|
||||
|bil_height_loop_null_1st|
|
||||
mov lr, r4, lsr #2 ; loop counter
|
||||
|
||||
|bil_width_loop_null_1st|
|
||||
ldrb r6, [r0] ; load data
|
||||
ldrb r7, [r0, #1]
|
||||
ldrb r8, [r0, #2]
|
||||
ldrb r9, [r0, #3]
|
||||
|
||||
strh r6, [r1], r3 ; store it to the intermediate buffer
|
||||
add r0, r0, #4
|
||||
strh r7, [r1], r3
|
||||
subs lr, lr, #1
|
||||
strh r8, [r1], r3
|
||||
strh r9, [r1], r3
|
||||
|
||||
bne bil_width_loop_null_1st
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, r2 ; move to next input line
|
||||
add r11, r11, #2 ; move over to next column
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_null_1st
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
ENDP ; |vpx_filter_block2d_bil_first_pass_media|
|
||||
|
||||
|
||||
; Second pass of the two-pass bilinear filter. Reads the 16-bit samples at
; src_ptr, combines each pair of consecutive samples with the two 16-bit taps
; at vpx_filter, rounds (+0x40, >> 7), saturates to 8 bits and stores bytes to
; dst_ptr with a stride of dst_pitch. When the packed tap word equals 128 the
; low byte of each sample is stored unfiltered.
;---------------------------------
|
||||
; r0 unsigned short *src_ptr,
|
||||
; r1 unsigned char *dst_ptr,
|
||||
; r2 int dst_pitch,
|
||||
; r3 unsigned int height,
|
||||
; stack unsigned int width,
|
||||
; stack const short *vpx_filter
|
||||
;---------------------------------
|
||||
|vpx_filter_block2d_bil_second_pass_media| PROC
|
||||
; nine registers are pushed (36 bytes), so the stack arguments sit at sp+36 and sp+40
stmdb sp!, {r4 - r11, lr}
|
||||
|
||||
ldr r11, [sp, #40] ; vpx_filter address
|
||||
ldr r4, [sp, #36] ; width
|
||||
|
||||
ldr r5, [r11] ; load up filter coefficients (both 16-bit taps in one word)
|
||||
mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix
|
||||
mov r11, r1 ; remember the start of the current output column
|
||||
|
||||
cmp r5, #128 ; if filter coef = 128, then skip the filter
|
||||
beq bil_null_2nd_filter
|
||||
|
||||
|bil_height_loop_2nd|
|
||||
ldr r6, [r0] ; load the data
|
||||
ldr r8, [r0, #4]
|
||||
ldrh r10, [r0, #8]
|
||||
mov lr, r3, lsr #2 ; loop counter (height / 4)
|
||||
|
||||
|bil_width_loop_2nd|
|
||||
pkhtb r7, r6, r8 ; src[1] | src[2]
|
||||
pkhtb r9, r8, r10 ; src[3] | src[4]
|
||||
|
||||
smuad r6, r6, r5 ; apply filter
|
||||
smuad r8, r8, r5 ; apply filter
|
||||
|
||||
subs lr, lr, #1 ; sets the flags used by the ldrne*/bne below
|
||||
|
||||
smuadx r7, r7, r5 ; apply filter (halfwords are packed in swapped order above)
|
||||
smuadx r9, r9, r5 ; apply filter
|
||||
|
||||
add r0, r0, #8
|
||||
|
||||
add r6, r6, #0x40 ; round_shift_and_clamp
|
||||
add r7, r7, #0x40
|
||||
usat r6, #8, r6, asr #7
|
||||
usat r7, #8, r7, asr #7
|
||||
strb r6, [r1], r2 ; the result is transposed back and stored
|
||||
|
||||
add r8, r8, #0x40 ; round_shift_and_clamp
|
||||
strb r7, [r1], r2
|
||||
add r9, r9, #0x40
|
||||
usat r8, #8, r8, asr #7
|
||||
usat r9, #8, r9, asr #7
|
||||
strb r8, [r1], r2 ; the result is transposed back and stored
|
||||
|
||||
ldrne r6, [r0] ; load data (only if another iteration follows)
|
||||
strb r9, [r1], r2
|
||||
ldrne r8, [r0, #4]
|
||||
ldrneh r10, [r0, #8]
|
||||
|
||||
bne bil_width_loop_2nd
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, #4 ; update src for next row
|
||||
add r11, r11, #1
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_2nd
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
|
||||
; Pass-through path: store the low byte of each 16-bit sample without filtering.
|bil_null_2nd_filter|
|
||||
|bil_height_loop_null_2nd|
|
||||
mov lr, r3, lsr #2
|
||||
|
||||
|bil_width_loop_null_2nd|
|
||||
ldr r6, [r0], #4 ; load data
|
||||
subs lr, lr, #1
|
||||
ldr r8, [r0], #4
|
||||
|
||||
strb r6, [r1], r2 ; store data
|
||||
mov r7, r6, lsr #16
|
||||
strb r7, [r1], r2
|
||||
mov r9, r8, lsr #16
|
||||
strb r8, [r1], r2
|
||||
strb r9, [r1], r2
|
||||
|
||||
bne bil_width_loop_null_2nd
|
||||
|
||||
subs r12, r12, #1
|
||||
add r0, r0, #4
|
||||
add r11, r11, #1
|
||||
mov r1, r11
|
||||
|
||||
bne bil_height_loop_null_2nd
|
||||
|
||||
ldmia sp!, {r4 - r11, pc}
|
||||
ENDP ; |vpx_filter_block2d_bil_second_pass_media|
|
||||
|
||||
END
|
@ -1,95 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vpx_sad16x16_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; Sum of absolute differences between 16 rows of 16 source bytes and the
; corresponding reference bytes. The loop runs 8 times and handles two rows
; per iteration; the total is returned in r0.
; r0 const unsigned char *src_ptr
|
||||
; r1 int src_stride
|
||||
; r2 const unsigned char *ref_ptr
|
||||
; r3 int ref_stride
|
||||
|vpx_sad16x16_media| PROC
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
pld [r0, r1, lsl #1]
|
||||
pld [r2, r3, lsl #1]
|
||||
|
||||
mov r4, #0 ; sad = 0;
|
||||
mov r5, #8 ; loop count (two rows per iteration)
|
||||
|
||||
loop
|
||||
; 1st row
|
||||
ldr r6, [r0, #0x0] ; load 4 src pixels (1A)
|
||||
ldr r8, [r2, #0x0] ; load 4 ref pixels (1A)
|
||||
ldr r7, [r0, #0x4] ; load 4 src pixels (1A)
|
||||
ldr r9, [r2, #0x4] ; load 4 ref pixels (1A)
|
||||
ldr r10, [r0, #0x8] ; load 4 src pixels (1B)
|
||||
ldr r11, [r0, #0xC] ; load 4 src pixels (1B)
|
||||
|
||||
usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels
|
||||
usad8 r8, r7, r9 ; calculate sad for 4 pixels (second accumulator in r8)
|
||||
|
||||
ldr r12, [r2, #0x8] ; load 4 ref pixels (1B)
|
||||
ldr lr, [r2, #0xC] ; load 4 ref pixels (1B)
|
||||
|
||||
add r0, r0, r1 ; set src pointer to next row
|
||||
add r2, r2, r3 ; set ref pointer to next row
|
||||
|
||||
pld [r0, r1, lsl #1]
|
||||
pld [r2, r3, lsl #1]
|
||||
|
||||
usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
|
||||
usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
|
||||
|
||||
ldr r6, [r0, #0x0] ; load 4 src pixels (2A)
|
||||
ldr r7, [r0, #0x4] ; load 4 src pixels (2A)
|
||||
add r4, r4, r8 ; add partial sad values
|
||||
|
||||
; 2nd row
|
||||
ldr r8, [r2, #0x0] ; load 4 ref pixels (2A)
|
||||
ldr r9, [r2, #0x4] ; load 4 ref pixels (2A)
|
||||
ldr r10, [r0, #0x8] ; load 4 src pixels (2B)
|
||||
ldr r11, [r0, #0xC] ; load 4 src pixels (2B)
|
||||
|
||||
usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels
|
||||
usad8 r8, r7, r9 ; calculate sad for 4 pixels
|
||||
|
||||
ldr r12, [r2, #0x8] ; load 4 ref pixels (2B)
|
||||
ldr lr, [r2, #0xC] ; load 4 ref pixels (2B)
|
||||
|
||||
add r0, r0, r1 ; set src pointer to next row
|
||||
add r2, r2, r3 ; set ref pointer to next row
|
||||
|
||||
usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels
|
||||
usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels
|
||||
|
||||
pld [r0, r1, lsl #1]
|
||||
pld [r2, r3, lsl #1]
|
||||
|
||||
subs r5, r5, #1 ; decrement loop counter
|
||||
add r4, r4, r8 ; add partial sad values
|
||||
|
||||
bne loop
|
||||
|
||||
mov r0, r4 ; return sad
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
||||
|
@ -1,80 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "vpx/vpx_integer.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
#if HAVE_MEDIA
|
||||
/*
 * Two-tap bilinear filter coefficients, indexed by sub-pixel offset 0..7.
 * Entry i is { 128 - 16 * i, 16 * i }, so every pair sums to 128. Entry 0
 * ({ 128, 0 }) is the pair the assembly passes treat as "no filtering".
 */
static const int16_t bilinear_filters_media[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 }
};

/*
 * First filter pass, implemented in assembly. Reads 8-bit pixels from
 * src_ptr (rows src_pitch bytes apart) and writes 16-bit results to dst_ptr.
 * `filter` points at one two-coefficient row of bilinear_filters_media.
 */
extern void vpx_filter_block2d_bil_first_pass_media(
    const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch,
    uint32_t height, uint32_t width, const int16_t *filter);

/*
 * Second filter pass, implemented in assembly. Reads the 16-bit output of
 * the first pass from src_ptr and writes 8-bit pixels to dst_ptr. The third
 * argument is the stride of the destination buffer, which is why it is named
 * dst_pitch here (matching the register description in the assembly).
 */
extern void vpx_filter_block2d_bil_second_pass_media(
    const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t dst_pitch,
    uint32_t height, uint32_t width, const int16_t *filter);
|
||||
|
||||
unsigned int vpx_sub_pixel_variance8x8_media(
|
||||
const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
|
||||
const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
|
||||
uint16_t first_pass[10 * 8];
|
||||
uint8_t second_pass[8 * 8];
|
||||
const int16_t *HFilter, *VFilter;
|
||||
|
||||
HFilter = bilinear_filters_media[xoffset];
|
||||
VFilter = bilinear_filters_media[yoffset];
|
||||
|
||||
vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
|
||||
src_pixels_per_line, 9, 8, HFilter);
|
||||
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8,
|
||||
VFilter);
|
||||
|
||||
return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line,
|
||||
sse);
|
||||
}
|
||||
|
||||
unsigned int vpx_sub_pixel_variance16x16_media(
|
||||
const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
|
||||
const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
|
||||
uint16_t first_pass[36 * 16];
|
||||
uint8_t second_pass[20 * 16];
|
||||
const int16_t *HFilter, *VFilter;
|
||||
unsigned int var;
|
||||
|
||||
if (xoffset == 4 && yoffset == 0) {
|
||||
var = vpx_variance_halfpixvar16x16_h_media(
|
||||
src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
|
||||
} else if (xoffset == 0 && yoffset == 4) {
|
||||
var = vpx_variance_halfpixvar16x16_v_media(
|
||||
src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
|
||||
} else if (xoffset == 4 && yoffset == 4) {
|
||||
var = vpx_variance_halfpixvar16x16_hv_media(
|
||||
src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
|
||||
} else {
|
||||
HFilter = bilinear_filters_media[xoffset];
|
||||
VFilter = bilinear_filters_media[yoffset];
|
||||
|
||||
vpx_filter_block2d_bil_first_pass_media(
|
||||
src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter);
|
||||
vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16,
|
||||
16, VFilter);
|
||||
|
||||
var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line,
|
||||
sse);
|
||||
}
|
||||
return var;
|
||||
}
|
||||
#endif // HAVE_MEDIA
|
@ -1,182 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vpx_variance_halfpixvar16x16_h_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; Variance of a 16x16 block against ref_ptr, with each source pixel first
; averaged with its right-hand neighbour: (src[x] + src[x + 1] + 1) >> 1.
; Processes 16 rows, four 4-pixel groups per row. Stores the accumulated sum of
; squared differences to *sse and returns sse - ((sum * sum) >> 8) in r0.
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vpx_variance_halfpixvar16x16_h_media| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
ldr r10, c80808080
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov lr, #0 ; constant zero
|
||||
loop
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load 4 src pixels
|
||||
ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
; per byte: uhsub8(a, ~b) ^ 0x80 == (a + b + 1) >> 1
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r4, [r0, #4] ; load 4 src pixels
|
||||
ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset
|
||||
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r4, [r0, #8] ; load 4 src pixels
|
||||
ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset
|
||||
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r4, [r0, #12] ; load 4 src pixels
|
||||
ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset
|
||||
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set ref_ptr to next row
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r6, [sp, #40] ; get address of sse (first stack argument, above the 10 saved registers)
|
||||
mul r0, r8, r8 ; sum * sum
|
||||
str r11, [r6] ; store sse
|
||||
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
c80808080
|
||||
DCD 0x80808080
|
||||
|
||||
END
|
||||
|
@ -1,222 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vpx_variance_halfpixvar16x16_hv_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; Variance of a 16x16 block against ref_ptr, with the source interpolated at
; the half-pixel position in both directions. For source pixels a, b on row N
; and c, d on row N+1: x = (a + b + 1) >> 1, y = (c + d + 1) >> 1 and the
; predicted pixel is z = (x + y + 1) >> 1. Processes 16 rows, four 4-pixel
; groups per row. Stores the accumulated sum of squared differences to *sse
; and returns sse - ((sum * sum) >> 8) in r0.
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vpx_variance_halfpixvar16x16_hv_media| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
ldr r10, c80808080
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov lr, #0 ; constant zero
|
||||
loop
|
||||
add r9, r0, r1 ; pointer to pixels on the next row
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load source pixels a, row N
|
||||
ldr r6, [r0, #1] ; load source pixels b, row N
|
||||
ldr r5, [r9, #0] ; load source pixels c, row N+1
|
||||
ldr r7, [r9, #1] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
; (per byte: uhsub8(a, ~b) ^ 0x80 yields the rounded average)
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r4, [r0, #4] ; load source pixels a, row N
|
||||
ldr r6, [r0, #5] ; load source pixels b, row N
|
||||
ldr r5, [r9, #4] ; load source pixels c, row N+1
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
ldr r7, [r9, #5] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r4, [r0, #8] ; load source pixels a, row N
|
||||
ldr r6, [r0, #9] ; load source pixels b, row N
|
||||
ldr r5, [r9, #8] ; load source pixels c, row N+1
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
ldr r7, [r9, #9] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r4, [r0, #12] ; load source pixels a, row N
|
||||
ldr r6, [r0, #13] ; load source pixels b, row N
|
||||
ldr r5, [r9, #12] ; load source pixels c, row N+1
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
ldr r7, [r9, #13] ; load source pixels d, row N+1
|
||||
|
||||
; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1
|
||||
mvn r7, r7
|
||||
uhsub8 r5, r5, r7
|
||||
eor r5, r5, r10
|
||||
; z = (x + y + 1) >> 1, interpolate half pixel values vertically
|
||||
mvn r5, r5
|
||||
uhsub8 r4, r4, r5
|
||||
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set ref_ptr to next row
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
subs r12, r12, #1
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r6, [sp, #40] ; get address of sse (first stack argument, above the 10 saved registers)
|
||||
mul r0, r8, r8 ; sum * sum
|
||||
str r11, [r6] ; store sse
|
||||
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
c80808080
|
||||
DCD 0x80808080
|
||||
|
||||
END
|
@ -1,184 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vpx_variance_halfpixvar16x16_v_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vpx_variance_halfpixvar16x16_v_media| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
ldr r10, c80808080
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov lr, #0 ; constant zero
|
||||
loop
|
||||
add r9, r0, r1 ; set src pointer to next row
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load 4 src pixels
|
||||
ldr r6, [r9, #0] ; load 4 src pixels from next row
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r4, [r0, #4] ; load 4 src pixels
|
||||
ldr r6, [r9, #4] ; load 4 src pixels from next row
|
||||
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r4, [r0, #8] ; load 4 src pixels
|
||||
ldr r6, [r9, #8] ; load 4 src pixels from next row
|
||||
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r4, [r0, #12] ; load 4 src pixels
|
||||
ldr r6, [r9, #12] ; load 4 src pixels from next row
|
||||
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||
|
||||
; bilinear interpolation
|
||||
mvn r6, r6
|
||||
uhsub8 r4, r4, r6
|
||||
eor r4, r4, r10
|
||||
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r6, r5, r4 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r6, r6, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
bne loop
|
||||
|
||||
; return stuff
|
||||
ldr r6, [sp, #40] ; get address of sse
|
||||
mul r0, r8, r8 ; sum * sum
|
||||
str r11, [r6] ; store sse
|
||||
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
c80808080
|
||||
DCD 0x80808080
|
||||
|
||||
END
|
||||
|
@ -1,358 +0,0 @@
|
||||
;
|
||||
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
|
||||
;
|
||||
; Use of this source code is governed by a BSD-style license
|
||||
; that can be found in the LICENSE file in the root of the source
|
||||
; tree. An additional intellectual property rights grant can be found
|
||||
; in the file PATENTS. All contributing project authors may
|
||||
; be found in the AUTHORS file in the root of the source tree.
|
||||
;
|
||||
|
||||
|
||||
EXPORT |vpx_variance16x16_media|
|
||||
EXPORT |vpx_variance8x8_media|
|
||||
EXPORT |vpx_mse16x16_media|
|
||||
|
||||
ARM
|
||||
REQUIRE8
|
||||
PRESERVE8
|
||||
|
||||
AREA ||.text||, CODE, READONLY, ALIGN=2
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vpx_variance16x16_media| PROC
|
||||
|
||||
stmfd sp!, {r4-r12, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r8, #0 ; initialize sum = 0
|
||||
mov r11, #0 ; initialize sse = 0
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
|
||||
loop16x16
|
||||
; 1st 4 pixels
|
||||
ldr r4, [r0, #0] ; load 4 src pixels
|
||||
ldr r5, [r2, #0] ; load 4 ref pixels
|
||||
|
||||
mov lr, #0 ; constant zero
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r6, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
adds r8, r8, r4 ; add positive differences to sum
|
||||
subs r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r4, [r0, #4] ; load 4 src pixels
|
||||
ldr r5, [r2, #4] ; load 4 ref pixels
|
||||
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r4, [r0, #8] ; load 4 src pixels
|
||||
ldr r5, [r2, #8] ; load 4 ref pixels
|
||||
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||
sel r6, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r4, [r0, #12] ; load 4 src pixels
|
||||
ldr r5, [r2, #12] ; load 4 ref pixels
|
||||
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r6, r4, r5 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r6, lr ; select bytes with positive difference
|
||||
usub8 r9, r5, r4 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r6, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r4, r7, lr ; calculate sum of positive differences
|
||||
usad8 r5, r6, lr ; calculate sum of negative differences
|
||||
orr r6, r6, r7 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r8, r8, r4 ; add positive differences to sum
|
||||
sub r8, r8, r5 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r5, r6 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r6, ror #8 ; another two pixels to halfwords
|
||||
smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1)
|
||||
smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
|
||||
subs r12, r12, #1
|
||||
|
||||
bne loop16x16
|
||||
|
||||
; return stuff
|
||||
ldr r6, [sp, #40] ; get address of sse
|
||||
mul r0, r8, r8 ; sum * sum
|
||||
str r11, [r6] ; store sse
|
||||
sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
|
||||
|
||||
ldmfd sp!, {r4-r12, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
|vpx_variance8x8_media| PROC
|
||||
|
||||
push {r4-r10, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r12, #8 ; set loop counter to 8 (=block height)
|
||||
mov r4, #0 ; initialize sum = 0
|
||||
mov r5, #0 ; initialize sse = 0
|
||||
|
||||
loop8x8
|
||||
; 1st 4 pixels
|
||||
ldr r6, [r0, #0x0] ; load 4 src pixels
|
||||
ldr r7, [r2, #0x0] ; load 4 ref pixels
|
||||
|
||||
mov lr, #0 ; constant zero
|
||||
|
||||
usub8 r8, r6, r7 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r10, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r7, r6 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r6, r10, lr ; calculate sum of positive differences
|
||||
usad8 r7, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r10 ; differences of all 4 pixels
|
||||
; calculate total sum
|
||||
add r4, r4, r6 ; add positive differences to sum
|
||||
sub r4, r4, r7 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r6, [r0, #0x4] ; load 4 src pixels
|
||||
ldr r7, [r2, #0x4] ; load 4 ref pixels
|
||||
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r8, r6, r7 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r10, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r7, r6 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r6, r10, lr ; calculate sum of positive differences
|
||||
usad8 r7, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r10 ; differences of all 4 pixels
|
||||
|
||||
; calculate total sum
|
||||
add r4, r4, r6 ; add positive differences to sum
|
||||
sub r4, r4, r7 ; subtract negative differences from sum
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r7, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r10, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1)
|
||||
subs r12, r12, #1 ; next row
|
||||
smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
bne loop8x8
|
||||
|
||||
; return stuff
|
||||
ldr r8, [sp, #32] ; get address of sse
|
||||
mul r1, r4, r4 ; sum * sum
|
||||
str r5, [r8] ; store sse
|
||||
sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6))
|
||||
|
||||
pop {r4-r10, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
; r0 unsigned char *src_ptr
|
||||
; r1 int source_stride
|
||||
; r2 unsigned char *ref_ptr
|
||||
; r3 int recon_stride
|
||||
; stack unsigned int *sse
|
||||
;
|
||||
;note: Based on vpx_variance16x16_media. In this function, sum is never used.
|
||||
; So, we can remove this part of calculation.
|
||||
|
||||
|vpx_mse16x16_media| PROC
|
||||
|
||||
push {r4-r9, lr}
|
||||
|
||||
pld [r0, r1, lsl #0]
|
||||
pld [r2, r3, lsl #0]
|
||||
|
||||
mov r12, #16 ; set loop counter to 16 (=block height)
|
||||
mov r4, #0 ; initialize sse = 0
|
||||
|
||||
loopmse
|
||||
; 1st 4 pixels
|
||||
ldr r5, [r0, #0x0] ; load 4 src pixels
|
||||
ldr r6, [r2, #0x0] ; load 4 ref pixels
|
||||
|
||||
mov lr, #0 ; constant zero
|
||||
|
||||
usub8 r8, r5, r6 ; calculate difference
|
||||
pld [r0, r1, lsl #1]
|
||||
sel r7, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||
pld [r2, r3, lsl #1]
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r7 ; differences of all 4 pixels
|
||||
|
||||
ldr r5, [r0, #0x4] ; load 4 src pixels
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 2nd 4 pixels
|
||||
ldr r6, [r2, #0x4] ; load 4 ref pixels
|
||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r8, r5, r6 ; calculate difference
|
||||
sel r7, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r7 ; differences of all 4 pixels
|
||||
ldr r5, [r0, #0x8] ; load 4 src pixels
|
||||
; calculate sse
|
||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 3rd 4 pixels
|
||||
ldr r6, [r2, #0x8] ; load 4 ref pixels
|
||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r8, r5, r6 ; calculate difference
|
||||
sel r7, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r7 ; differences of all 4 pixels
|
||||
|
||||
ldr r5, [r0, #0xc] ; load 4 src pixels
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||
|
||||
; 4th 4 pixels
|
||||
ldr r6, [r2, #0xc] ; load 4 ref pixels
|
||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
usub8 r8, r5, r6 ; calculate difference
|
||||
add r0, r0, r1 ; set src_ptr to next row
|
||||
sel r7, r8, lr ; select bytes with positive difference
|
||||
usub8 r9, r6, r5 ; calculate difference with reversed operands
|
||||
add r2, r2, r3 ; set dst_ptr to next row
|
||||
sel r8, r9, lr ; select bytes with negative difference
|
||||
|
||||
; calculate partial sums
|
||||
usad8 r5, r7, lr ; calculate sum of positive differences
|
||||
usad8 r6, r8, lr ; calculate sum of negative differences
|
||||
orr r8, r8, r7 ; differences of all 4 pixels
|
||||
|
||||
subs r12, r12, #1 ; next row
|
||||
|
||||
; calculate sse
|
||||
uxtb16 r6, r8 ; byte (two pixels) to halfwords
|
||||
uxtb16 r7, r8, ror #8 ; another two pixels to halfwords
|
||||
smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1)
|
||||
smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2)
|
||||
|
||||
bne loopmse
|
||||
|
||||
; return stuff
|
||||
ldr r1, [sp, #28] ; get address of sse
|
||||
mov r0, r4 ; return sse
|
||||
str r4, [r1] ; store sse
|
||||
|
||||
pop {r4-r9, pc}
|
||||
|
||||
ENDP
|
||||
|
||||
END
|
@ -271,7 +271,6 @@ DSP_SRCS-yes += subtract.c
|
||||
DSP_SRCS-yes += sum_squares.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
|
||||
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c
|
||||
@ -302,12 +301,6 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)
|
||||
DSP_SRCS-yes += variance.c
|
||||
DSP_SRCS-yes += variance.h
|
||||
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM)
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
|
||||
|
||||
|
@ -960,7 +960,7 @@ add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride
|
||||
specialize qw/vpx_sad16x32 msa sse2/;
|
||||
|
||||
add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
|
||||
specialize qw/vpx_sad16x16 media neon msa sse2/;
|
||||
specialize qw/vpx_sad16x16 neon msa sse2/;
|
||||
|
||||
add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
|
||||
specialize qw/vpx_sad16x8 neon msa sse2/;
|
||||
@ -1387,7 +1387,7 @@ add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int sourc
|
||||
specialize qw/vpx_variance16x32 sse2 msa/;
|
||||
|
||||
add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/;
|
||||
specialize qw/vpx_variance16x16 sse2 avx2 neon msa/;
|
||||
|
||||
add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vpx_variance16x8 sse2 neon msa/;
|
||||
@ -1396,7 +1396,7 @@ add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source
|
||||
specialize qw/vpx_variance8x16 sse2 neon msa/;
|
||||
|
||||
add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vpx_variance8x8 sse2 media neon msa/;
|
||||
specialize qw/vpx_variance8x8 sse2 neon msa/;
|
||||
|
||||
add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
specialize qw/vpx_variance8x4 sse2 msa/;
|
||||
@ -1417,7 +1417,7 @@ add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, co
|
||||
specialize qw/vpx_get8x8var sse2 neon msa/;
|
||||
|
||||
add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||
specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/;
|
||||
specialize qw/vpx_mse16x16 sse2 avx2 neon msa/;
|
||||
|
||||
add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
|
||||
specialize qw/vpx_mse16x8 sse2 msa/;
|
||||
@ -1458,7 +1458,7 @@ add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int
|
||||
specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/;
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance16x16 media neon msa sse2 ssse3/;
|
||||
specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/;
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/;
|
||||
@ -1467,7 +1467,7 @@ add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int
|
||||
specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/;
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance8x8 media neon msa sse2 ssse3/;
|
||||
specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/;
|
||||
|
||||
add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/;
|
||||
@ -1520,14 +1520,19 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
|
||||
#
|
||||
# Specialty Subpixel
|
||||
#
|
||||
# TODO(johannkoenig): Add neon implementations of
|
||||
# vpx_variance_halfpixvar16x16_h
|
||||
# vpx_variance_halfpixvar16x16_v
|
||||
# vpx_variance_halfpixvar16x16_hv
|
||||
# https://bugs.chromium.org/p/webm/issues/detail?id=1273
|
||||
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_variance_halfpixvar16x16_h sse2 media/;
|
||||
specialize qw/vpx_variance_halfpixvar16x16_h sse2/;
|
||||
|
||||
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_variance_halfpixvar16x16_v sse2 media/;
|
||||
specialize qw/vpx_variance_halfpixvar16x16_v sse2/;
|
||||
|
||||
add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse";
|
||||
specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/;
|
||||
specialize qw/vpx_variance_halfpixvar16x16_hv sse2/;
|
||||
|
||||
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
||||
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
|
||||
|
@ -10,8 +10,9 @@
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "vpx_ports/arm.h"
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "vpx_ports/arm.h"
|
||||
|
||||
#ifdef WINAPI_FAMILY
|
||||
#include <winapifamily.h>
|
||||
@ -49,9 +50,6 @@ int arm_cpu_caps(void) {
|
||||
return flags;
|
||||
}
|
||||
mask = arm_cpu_env_mask();
|
||||
#if HAVE_MEDIA
|
||||
flags |= HAS_MEDIA;
|
||||
#endif /* HAVE_MEDIA */
|
||||
#if HAVE_NEON || HAVE_NEON_ASM
|
||||
flags |= HAS_NEON;
|
||||
#endif /* HAVE_NEON || HAVE_NEON_ASM */
|
||||
@ -75,28 +73,18 @@ int arm_cpu_caps(void) {
|
||||
* instructions via their assembled hex code.
|
||||
* All of these instructions should be essentially nops.
|
||||
*/
|
||||
#if HAVE_MEDIA
|
||||
if (mask & HAS_MEDIA) __try {
|
||||
/*SHADD8 r3,r3,r3*/
|
||||
__emit(0xE6333F93);
|
||||
flags |= HAS_MEDIA;
|
||||
#if HAVE_NEON || HAVE_NEON_ASM
|
||||
if (mask & HAS_NEON) {
|
||||
__try {
|
||||
/*VORR q0,q0,q0*/
|
||||
__emit(0xF2200150);
|
||||
flags |= HAS_NEON;
|
||||
} __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
|
||||
/*Ignore exception.*/
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_MEDIA */
|
||||
#if HAVE_NEON || HAVE_NEON_ASM
|
||||
if (mask & HAS_NEON) {
|
||||
__try {
|
||||
/*VORR q0,q0,q0*/
|
||||
__emit(0xF2200150);
|
||||
flags |= HAS_NEON;
|
||||
} __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) {
|
||||
/*Ignore exception.*/
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_NEON || HAVE_NEON_ASM */
|
||||
return flags & mask;
|
||||
return flags & mask;
|
||||
}
|
||||
|
||||
#elif defined(__ANDROID__) /* end _MSC_VER */
|
||||
@ -112,9 +100,6 @@ int arm_cpu_caps(void) {
|
||||
mask = arm_cpu_env_mask();
|
||||
features = android_getCpuFeatures();
|
||||
|
||||
#if HAVE_MEDIA
|
||||
flags |= HAS_MEDIA;
|
||||
#endif /* HAVE_MEDIA */
|
||||
#if HAVE_NEON || HAVE_NEON_ASM
|
||||
if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON;
|
||||
#endif /* HAVE_NEON || HAVE_NEON_ASM */
|
||||
@ -153,15 +138,6 @@ int arm_cpu_caps(void) {
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_NEON || HAVE_NEON_ASM */
|
||||
#if HAVE_MEDIA
|
||||
if (memcmp(buf, "CPU architecture:", 17) == 0) {
|
||||
int version;
|
||||
version = atoi(buf + 17);
|
||||
if (version >= 6) {
|
||||
flags |= HAS_MEDIA;
|
||||
}
|
||||
}
|
||||
#endif /* HAVE_MEDIA */
|
||||
}
|
||||
fclose(fin);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user