vpx/vp8/common/arm/armv6/copymem16x16_v6.asm
Taekhyun Kim 458fb8f491 utilize preload in ARMv6 MC/LPF/Copy routines
About 9~10% decoding perf improvement on non-Neon ARM cpus

Change-Id: I7dc2a026764e84e9c2faf282b4ae113090326837
2011-06-17 14:04:53 -07:00

187 lines
3.9 KiB
NASM

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
; Make the entry symbol of the routine defined below visible to the linker.
EXPORT |vp8_copy_mem16x16_v6|
; The next three lines are commented-out directives: as written they are
; plain comments and have no effect on how this file is assembled.
; ARM
; REQUIRE8
; PRESERVE8
; Everything that follows is placed in a read-only code area named "Block".
AREA Block, CODE, READONLY ; name this block of code
;-----------------------------------------------------------------------------
; void vp8_copy_mem16x16_v6(unsigned char *src, int src_stride,
;                           unsigned char *dst, int dst_stride)
;
; Copies a 16x16 block of bytes, one 16-byte row per loop iteration.
;
; In:    r0 = src, r1 = src_stride, r2 = dst, r3 = dst_stride
; Out:   nothing
; Uses:  r4-r7 as data registers (saved on entry, restored on every exit),
;        r12 as the row counter, and the condition flags.
;        r0-r3 are modified.
;
; The copy width is chosen once, from the alignment of src at entry:
;   16-byte aligned -> one 4-word ldm/stm per row
;    8-byte aligned -> two 2-word ldm/stm per row
;    4-byte aligned -> four ldr/str per row
;   anything else   -> sixteen ldrb/strb per row
;
; NOTE(review): only src at entry is tested. The word-wide paths therefore
; assume that src_stride keeps that alignment on the following rows and that
; dst / dst_stride are word aligned -- confirm against the callers.
;
; Every exit returns with "bx lr" instead of "mov pc, lr" so the return also
; switches back to the caller's instruction set when the caller is Thumb code.
;-----------------------------------------------------------------------------
|vp8_copy_mem16x16_v6| PROC
    stmdb       sp!, {r4 - r7}          ; save the data registers used below

    pld         [r0, #31]               ; cache hint for the first source row

    ; Pick the widest copy the source alignment allows. tst only sets the
    ; flags, so no register is spent on the masked value.
    tst         r0, #15
    beq         copy_mem16x16_fast
    tst         r0, #7
    beq         copy_mem16x16_8
    tst         r0, #3
    beq         copy_mem16x16_4

    ;-------------------------------------------------------------------------
    ; Unaligned source: copy one byte at a time.
    ; Bytes 0-3 of each row are loaded ahead of the stores that consume them:
    ; once here for row 0, then at the bottom of the loop for the next row.
    ;-------------------------------------------------------------------------
    ldrb        r4, [r0]
    ldrb        r5, [r0, #1]
    ldrb        r6, [r0, #2]
    ldrb        r7, [r0, #3]

    mov         r12, #16                ; r12 = rows remaining

copy_mem16x16_1_loop
    strb        r4, [r2]
    strb        r5, [r2, #1]
    strb        r6, [r2, #2]
    strb        r7, [r2, #3]

    ldrb        r4, [r0, #4]
    ldrb        r5, [r0, #5]
    ldrb        r6, [r0, #6]
    ldrb        r7, [r0, #7]

    subs        r12, r12, #1            ; flags stay live until the bne below;
                                        ; nothing in between updates them

    strb        r4, [r2, #4]
    strb        r5, [r2, #5]
    strb        r6, [r2, #6]
    strb        r7, [r2, #7]

    ldrb        r4, [r0, #8]
    ldrb        r5, [r0, #9]
    ldrb        r6, [r0, #10]
    ldrb        r7, [r0, #11]

    strb        r4, [r2, #8]
    strb        r5, [r2, #9]
    strb        r6, [r2, #10]
    strb        r7, [r2, #11]

    ldrb        r4, [r0, #12]
    ldrb        r5, [r0, #13]
    ldrb        r6, [r0, #14]
    ldrb        r7, [r0, #15]

    add         r0, r0, r1              ; src += src_stride

    strb        r4, [r2, #12]
    strb        r5, [r2, #13]
    strb        r6, [r2, #14]
    strb        r7, [r2, #15]

    add         r2, r2, r3              ; dst += dst_stride

    ; Load bytes 0-3 of the next row only if a next row exists, so nothing
    ; is read from beyond the last row.
    ldrneb      r4, [r0]
    ldrneb      r5, [r0, #1]
    ldrneb      r6, [r0, #2]
    ldrneb      r7, [r0, #3]

    pld         [r0, #31]               ; cache hint at next-row address + 31
                                        ; (PLD is a hint; it loads no register)

    bne         copy_mem16x16_1_loop

    ldmia       sp!, {r4 - r7}
    bx          lr

    ;-------------------------------------------------------------------------
    ; Source is 4-byte aligned: four word loads/stores per row.
    ; The whole row is loaded ahead: once here, then conditionally in the loop.
    ;-------------------------------------------------------------------------
copy_mem16x16_4
    ldr         r4, [r0]
    ldr         r5, [r0, #4]
    ldr         r6, [r0, #8]
    ldr         r7, [r0, #12]

    mov         r12, #16                ; r12 = rows remaining

copy_mem16x16_4_loop
    subs        r12, r12, #1
    add         r0, r0, r1              ; src += src_stride

    str         r4, [r2]
    str         r5, [r2, #4]
    str         r6, [r2, #8]
    str         r7, [r2, #12]

    add         r2, r2, r3              ; dst += dst_stride

    ; Fetch the next row only when there is one.
    ldrne       r4, [r0]
    ldrne       r5, [r0, #4]
    ldrne       r6, [r0, #8]
    ldrne       r7, [r0, #12]

    pld         [r0, #31]               ; cache hint at next-row address + 31

    bne         copy_mem16x16_4_loop

    ldmia       sp!, {r4 - r7}
    bx          lr

    ;-------------------------------------------------------------------------
    ; Source is 8-byte aligned: two 2-word block transfers per row.
    ; ldmia/stmia with writeback advance each pointer by 16 bytes per row, so
    ; both strides are reduced by 16 up front to compensate.
    ;-------------------------------------------------------------------------
copy_mem16x16_8
    sub         r1, r1, #16
    sub         r3, r3, #16

    mov         r12, #16                ; r12 = rows remaining

copy_mem16x16_8_loop
    ldmia       r0!, {r4-r5}
    ldmia       r0!, {r6-r7}

    add         r0, r0, r1              ; src -> start of next row

    stmia       r2!, {r4-r5}
    subs        r12, r12, #1
    stmia       r2!, {r6-r7}

    add         r2, r2, r3              ; dst -> start of next row

    pld         [r0, #31]               ; cache hint at next-row address + 31
    bne         copy_mem16x16_8_loop

    ldmia       sp!, {r4 - r7}
    bx          lr

    ;-------------------------------------------------------------------------
    ; Source is 16-byte aligned: one 4-word block transfer per row.
    ; No writeback is used here, so the strides are applied unmodified.
    ;-------------------------------------------------------------------------
copy_mem16x16_fast
    mov         r12, #16                ; r12 = rows remaining

copy_mem16x16_fast_loop
    ldmia       r0, {r4-r7}
    add         r0, r0, r1              ; src += src_stride

    subs        r12, r12, #1
    stmia       r2, {r4-r7}
    add         r2, r2, r3              ; dst += dst_stride

    pld         [r0, #31]               ; cache hint at next-row address + 31
    bne         copy_mem16x16_fast_loop

    ldmia       sp!, {r4 - r7}
    bx          lr

    ENDP  ; |vp8_copy_mem16x16_v6|

    END