; vpx/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_yv12_copy_src_frame_func_neon|
    ARM
    REQUIRE8
    PRESERVE8

    INCLUDE vpx_scale_asm_offsets.asm

    AREA ||.text||, CODE, READONLY, ALIGN=2
;Note: This function copies the source data in src_buffer[i] at the beginning of
;encoding. The buffer has a width and height of cpi->oxcf.Width and cpi->oxcf.Height,
;which can be ANY numbers (NOT always multiples of 16 or 4).

;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);

;-----------------------------------------------------------------------------
; vp8_yv12_copy_src_frame_func_neon
;
; Copies the Y, U and V planes described by the source buffer config into the
; buffers described by the destination buffer config. For each plane, `height`
; rows of `width` bytes are copied; height and width are read from the SOURCE
; config only, strides and buffer pointers from both configs.
;
; In:     r0 = src_ybc, r1 = dst_ybc (neither pointer is written through
;         except via the destination plane buffers).
; Out:    nothing.
; Saved:  r4-r11, lr and d8-d15 are pushed on entry and restored on exit.
; Scratch: r2, r3, r12, q0-q3, q8-q15, flags.
;
; Register roles inside the copy loops:
;   r2  / r3    current src / dst byte pointer, first row of the pair
;   r10 / r11   current src / dst byte pointer, second row of the pair
;   r4          plane height (rows)
;   r5          plane width  (bytes per row)
;   r6  / r7    2 * stride - width for src / dst: added after a row pair to
;               step every pointer to the start of the next pair
;   r8          scratch for the byte-at-a-time tail
;   r9          chroma plane counter (2 = U pending, 1 = V pending)
;   r12         bytes still to copy in the current row(s)
;   lr          row pairs still to copy
;
; Rows are copied two at a time; a final single row is copied when the height
; is odd. Each row is copied in 128-byte (Y) or 64-byte (U/V) NEON chunks,
; then 8-byte chunks, then single bytes. Every chunk loop is guarded by an
; unsigned compare on entry, so a width smaller than the chunk size, a
; remainder of 1..7 bytes, or a height of 0 or 1 simply skips the loop
; instead of over-copying and wrapping the unsigned counter.
;-----------------------------------------------------------------------------
|vp8_yv12_copy_src_frame_func_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8 - d15}              ;q4-q7 (d8-d15) are overwritten below

    ;Copy Y plane
    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]
    ldr             r7, [r1, #yv12_buffer_config_y_stride]
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]       ;dstptr1

    add             r10, r2, r6             ;second row src
    add             r11, r3, r7             ;second row dst
    mov             r6, r6, lsl #1
    mov             r7, r7, lsl #1
    sub             r6, r6, r5              ;r6 = 2 * src stride - width
    sub             r7, r7, r5              ;r7 = 2 * dst stride - width

    ; copy two rows at one time
    movs            lr, r4, lsr #1          ;lr = number of row pairs
    beq             cp_y_odd_row            ;height < 2: there is no pair to copy

cp_src_to_dst_height_loop
    mov             r12, r5                 ;r12 = bytes left in this row pair
    cmp             r12, #128
    blo             cp_width_8_check        ;less than one 128-byte chunk left

    ;invariant: r12 >= 128 at the top of every iteration
cp_width_128_loop
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q4, q5}, [r10]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q6, q7}, [r10]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q14, q15}, [r10]!
    sub             r12, r12, #128
    cmp             r12, #128               ;flags survive the stores below
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q4, q5}, [r11]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q6, q7}, [r11]!
    vst1.8          {q8, q9}, [r3]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q10, q11}, [r3]!
    vst1.8          {q14, q15}, [r11]!
    bhs             cp_width_128_loop

cp_width_8_check
    cmp             r12, #8
    blo             cp_width_1_check        ;less than one 8-byte chunk left

    ;invariant: r12 >= 8 at the top of every iteration
cp_width_8_loop
    vld1.8          {d0}, [r2]!
    vld1.8          {d1}, [r10]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    vst1.8          {d1}, [r11]!
    bhs             cp_width_8_loop

cp_width_1_check
    cmp             r12, #0
    beq             cp_width_done

    ;invariant: 1 <= r12 <= 7 on entry
cp_width_1_loop
    ldrb            r8, [r2], #1
    subs            r12, r12, #1            ;ldrb/strb below leave the flags alone
    strb            r8, [r3], #1
    ldrb            r8, [r10], #1
    strb            r8, [r11], #1
    bne             cp_width_1_loop

cp_width_done
    subs            lr, lr, #1
    add             r2, r2, r6              ;step all four pointers to the next row pair
    add             r3, r3, r7
    add             r10, r10, r6
    add             r11, r11, r7
    bne             cp_src_to_dst_height_loop

;copy last line for Y if y_height is odd
cp_y_odd_row
    tst             r4, #1
    beq             cp_width_done_1
    mov             r12, r5                 ;r12 = bytes left in the last row
    cmp             r12, #128
    blo             cp_width_8_check_1

cp_width_128_loop_1
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q10, q11}, [r2]!
    sub             r12, r12, #128
    cmp             r12, #128
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q8, q9}, [r3]!
    vst1.8          {q10, q11}, [r3]!
    bhs             cp_width_128_loop_1

cp_width_8_check_1
    cmp             r12, #8
    blo             cp_width_1_check_1

cp_width_8_loop_1
    vld1.8          {d0}, [r2]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    bhs             cp_width_8_loop_1

cp_width_1_check_1
    cmp             r12, #0
    beq             cp_width_done_1

cp_width_1_loop_1
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    bne             cp_width_1_loop_1
cp_width_done_1

;Copy U & V planes
    ldr             r4, [r0, #yv12_buffer_config_uv_height]
    ldr             r5, [r0, #yv12_buffer_config_uv_width]
    ldr             r6, [r0, #yv12_buffer_config_uv_stride]
    ldr             r7, [r1, #yv12_buffer_config_uv_stride]
    ldr             r2, [r0, #yv12_buffer_config_u_buffer]       ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_u_buffer]       ;dstptr1

    add             r10, r2, r6             ;second row src
    add             r11, r3, r7             ;second row dst
    mov             r6, r6, lsl #1
    mov             r7, r7, lsl #1
    sub             r6, r6, r5              ;r6 = 2 * src stride - width
    sub             r7, r7, r5              ;r7 = 2 * dst stride - width

    mov             r9, #2                  ;two chroma planes: U first, then V

cp_uv_loop
    ;copy two rows at one time
    movs            lr, r4, lsr #1          ;lr = number of row pairs
    beq             cp_uv_odd_row           ;height < 2: there is no pair to copy

cp_src_to_dst_height_uv_loop
    mov             r12, r5                 ;r12 = bytes left in this row pair
    cmp             r12, #64
    blo             cp_width_uv_8_check     ;less than one 64-byte chunk left

    ;invariant: r12 >= 64 at the top of every iteration
cp_width_uv_64_loop
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q4, q5}, [r10]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q6, q7}, [r10]!
    sub             r12, r12, #64
    cmp             r12, #64
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q4, q5}, [r11]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q6, q7}, [r11]!
    bhs             cp_width_uv_64_loop

cp_width_uv_8_check
    cmp             r12, #8
    blo             cp_width_uv_1_check

cp_width_uv_8_loop
    vld1.8          {d0}, [r2]!
    vld1.8          {d1}, [r10]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    vst1.8          {d1}, [r11]!
    bhs             cp_width_uv_8_loop

cp_width_uv_1_check
    cmp             r12, #0
    beq             cp_width_uv_done

cp_width_uv_1_loop
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    ldrb            r8, [r10], #1
    strb            r8, [r11], #1
    bne             cp_width_uv_1_loop

cp_width_uv_done
    subs            lr, lr, #1
    add             r2, r2, r6              ;step all four pointers to the next row pair
    add             r3, r3, r7
    add             r10, r10, r6
    add             r11, r11, r7
    bne             cp_src_to_dst_height_uv_loop

;copy last line for U & V if uv_height is odd
cp_uv_odd_row
    tst             r4, #1
    beq             cp_width_uv_done_1
    mov             r12, r5                 ;r12 = bytes left in the last row
    cmp             r12, #64
    blo             cp_width_uv_8_check_1

cp_width_uv_64_loop_1
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q2, q3}, [r2]!
    sub             r12, r12, #64
    cmp             r12, #64
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q2, q3}, [r3]!
    bhs             cp_width_uv_64_loop_1

cp_width_uv_8_check_1
    cmp             r12, #8
    blo             cp_width_uv_1_check_1

cp_width_uv_8_loop_1
    vld1.8          {d0}, [r2]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    bhs             cp_width_uv_8_loop_1

cp_width_uv_1_check_1
    cmp             r12, #0
    beq             cp_width_uv_done_1

cp_width_uv_1_loop_1
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    bne             cp_width_uv_1_loop_1
cp_width_uv_done_1

    ;Switch to the V plane after U. r4-r7 (height, width, adjusted strides)
    ;are reused unchanged; the raw strides are reloaded only to rebuild the
    ;second-row pointers because r6/r7 no longer hold them. The conditional
    ;loads/adds do not touch the flags set by subs.
    subs            r9, r9, #1
    ldrne           r2, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
    ldrne           r3, [r1, #yv12_buffer_config_v_buffer]      ;dstptr1
    ldrne           r10, [r0, #yv12_buffer_config_uv_stride]
    ldrne           r11, [r1, #yv12_buffer_config_uv_stride]

    addne           r10, r2, r10                ;second row src
    addne           r11, r3, r11                ;second row dst

    bne             cp_uv_loop

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

    ENDP
    END
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
Use WebM in copyright notice for consistency Changes 'The VP8 project' to 'The WebM project', for consistency with other webmproject.org repositories. Fixes issue #97. Change-Id: I37c13ed5fbdb9d334ceef71c6350e9febed9bbba 2010-09-09 14:16:39 +02:00			`; Copyright (c) 2010 The WebM project authors. All Rights Reserved.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; Use of this source code is governed by a BSD-style license`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; that can be found in the LICENSE file in the root of the source`
			`; tree. An additional intellectual property rights grant can be found`
cosmetics: trim trailing whitespace When the license headers were updated, they accidentally contained trailing whitespace, so unfortunately we have to touch all the files again. Change-Id: I236c05fade06589e417179c0444cb39b09e4200d 2010-06-18 18:39:21 +02:00			`; in the file PATENTS. All contributing project authors may`
LICENSE: update with latest text Change-Id: Ieebea089095d9073b3a94932791099f614ce120c 2010-06-04 22:19:40 +02:00			`; be found in the AUTHORS file in the root of the source tree.`
Initial WebM release 2010-05-18 17:58:33 +02:00			`;`


			`EXPORT \|vp8_yv12_copy_src_frame_func_neon\|`
			`ARM`
			`REQUIRE8`
			`PRESERVE8`

Update ARM for vpx_scale changes Refactor asm_offsets for vpx_scale. Change-Id: I2db0eeb28c8e757bd033c6614a1e5319a1a204a5 2012-11-30 21:25:01 +01:00			`INCLUDE vpx_scale_asm_offsets.asm`
Initial WebM release 2010-05-18 17:58:33 +02:00
			`AREA \|\|.text\|\|, CODE, READONLY, ALIGN=2`
			`;Note: This function is used to copy source data in src_buffer[i] at beginning of`
			`;the encoding. The buffer has a width and height of cpi->oxcf.Width and cpi->oxcf.Height,`
			`;which can be ANY numbers(NOT always multiples of 16 or 4).`

			`;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG src_ybc, YV12_BUFFER_CONFIG dst_ybc);`

			`\|vp8_yv12_copy_src_frame_func_neon\| PROC`
			`push {r4 - r11, lr}`
			`vpush {d8 - d15}`

			`;Copy Y plane`
			`ldr r4, [r0, #yv12_buffer_config_y_height]`
			`ldr r5, [r0, #yv12_buffer_config_y_width]`
			`ldr r6, [r0, #yv12_buffer_config_y_stride]`
			`ldr r7, [r1, #yv12_buffer_config_y_stride]`
			`ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1`
			`ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1`

			`add r10, r2, r6 ;second row src`
			`add r11, r3, r7 ;second row dst`
			`mov r6, r6, lsl #1`
			`mov r7, r7, lsl #1`
			`sub r6, r6, r5 ;adjust stride`
			`sub r7, r7, r5`

			`; copy two rows at one time`
			`mov lr, r4, lsr #1`

			`cp_src_to_dst_height_loop`
			`mov r12, r5`

			`cp_width_128_loop`
			`vld1.8 {q0, q1}, [r2]!`
			`vld1.8 {q4, q5}, [r10]!`
			`vld1.8 {q2, q3}, [r2]!`
			`vld1.8 {q6, q7}, [r10]!`
			`vld1.8 {q8, q9}, [r2]!`
			`vld1.8 {q12, q13}, [r10]!`
			`vld1.8 {q10, q11}, [r2]!`
			`vld1.8 {q14, q15}, [r10]!`
			`sub r12, r12, #128`
			`cmp r12, #128`
			`vst1.8 {q0, q1}, [r3]!`
			`vst1.8 {q4, q5}, [r11]!`
			`vst1.8 {q2, q3}, [r3]!`
			`vst1.8 {q6, q7}, [r11]!`
			`vst1.8 {q8, q9}, [r3]!`
			`vst1.8 {q12, q13}, [r11]!`
			`vst1.8 {q10, q11}, [r3]!`
			`vst1.8 {q14, q15}, [r11]!`
			`bhs cp_width_128_loop`

			`cmp r12, #0`
			`beq cp_width_done`

			`cp_width_8_loop`
			`vld1.8 {d0}, [r2]!`
			`vld1.8 {d1}, [r10]!`
			`sub r12, r12, #8`
			`cmp r12, #8`
			`vst1.8 {d0}, [r3]!`
			`vst1.8 {d1}, [r11]!`
			`bhs cp_width_8_loop`

			`cmp r12, #0`
			`beq cp_width_done`

			`cp_width_1_loop`
			`ldrb r8, [r2], #1`
			`subs r12, r12, #1`
			`strb r8, [r3], #1`
			`ldrb r8, [r10], #1`
			`strb r8, [r11], #1`
			`bne cp_width_1_loop`

			`cp_width_done`
			`subs lr, lr, #1`
			`add r2, r2, r6`
			`add r3, r3, r7`
			`add r10, r10, r6`
			`add r11, r11, r7`
			`bne cp_src_to_dst_height_loop`

			`;copy last line for Y if y_height is odd`
			`tst r4, #1`
			`beq cp_width_done_1`
			`mov r12, r5`

			`cp_width_128_loop_1`
			`vld1.8 {q0, q1}, [r2]!`
			`vld1.8 {q2, q3}, [r2]!`
			`vld1.8 {q8, q9}, [r2]!`
			`vld1.8 {q10, q11}, [r2]!`
			`sub r12, r12, #128`
			`cmp r12, #128`
			`vst1.8 {q0, q1}, [r3]!`
			`vst1.8 {q2, q3}, [r3]!`
			`vst1.8 {q8, q9}, [r3]!`
			`vst1.8 {q10, q11}, [r3]!`
			`bhs cp_width_128_loop_1`

			`cmp r12, #0`
			`beq cp_width_done_1`

			`cp_width_8_loop_1`
			`vld1.8 {d0}, [r2]!`
			`sub r12, r12, #8`
			`cmp r12, #8`
			`vst1.8 {d0}, [r3]!`
			`bhs cp_width_8_loop_1`

			`cmp r12, #0`
			`beq cp_width_done_1`

			`cp_width_1_loop_1`
			`ldrb r8, [r2], #1`
			`subs r12, r12, #1`
			`strb r8, [r3], #1`
			`bne cp_width_1_loop_1`
			`cp_width_done_1`

			`;Copy U & V planes`
			`ldr r4, [r0, #yv12_buffer_config_uv_height]`
			`ldr r5, [r0, #yv12_buffer_config_uv_width]`
			`ldr r6, [r0, #yv12_buffer_config_uv_stride]`
			`ldr r7, [r1, #yv12_buffer_config_uv_stride]`
			`ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1`
			`ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1`

			`add r10, r2, r6 ;second row src`
			`add r11, r3, r7 ;second row dst`
			`mov r6, r6, lsl #1`
			`mov r7, r7, lsl #1`
			`sub r6, r6, r5 ;adjust stride`
			`sub r7, r7, r5`

			`mov r9, #2`

			`cp_uv_loop`
			`;copy two rows at one time`
			`mov lr, r4, lsr #1`

			`cp_src_to_dst_height_uv_loop`
			`mov r12, r5`

			`cp_width_uv_64_loop`
			`vld1.8 {q0, q1}, [r2]!`
			`vld1.8 {q4, q5}, [r10]!`
			`vld1.8 {q2, q3}, [r2]!`
			`vld1.8 {q6, q7}, [r10]!`
			`sub r12, r12, #64`
			`cmp r12, #64`
			`vst1.8 {q0, q1}, [r3]!`
			`vst1.8 {q4, q5}, [r11]!`
			`vst1.8 {q2, q3}, [r3]!`
			`vst1.8 {q6, q7}, [r11]!`
			`bhs cp_width_uv_64_loop`

			`cmp r12, #0`
			`beq cp_width_uv_done`

			`cp_width_uv_8_loop`
			`vld1.8 {d0}, [r2]!`
			`vld1.8 {d1}, [r10]!`
			`sub r12, r12, #8`
			`cmp r12, #8`
			`vst1.8 {d0}, [r3]!`
			`vst1.8 {d1}, [r11]!`
			`bhs cp_width_uv_8_loop`

			`cmp r12, #0`
			`beq cp_width_uv_done`

			`cp_width_uv_1_loop`
			`ldrb r8, [r2], #1`
			`subs r12, r12, #1`
			`strb r8, [r3], #1`
			`ldrb r8, [r10], #1`
			`strb r8, [r11], #1`
			`bne cp_width_uv_1_loop`

			`cp_width_uv_done`
			`subs lr, lr, #1`
			`add r2, r2, r6`
			`add r3, r3, r7`
			`add r10, r10, r6`
			`add r11, r11, r7`
			`bne cp_src_to_dst_height_uv_loop`

			`;copy last line for U & V if uv_height is odd`
			`tst r4, #1`
			`beq cp_width_uv_done_1`
			`mov r12, r5`

			`cp_width_uv_64_loop_1`
			`vld1.8 {q0, q1}, [r2]!`
			`vld1.8 {q2, q3}, [r2]!`
			`sub r12, r12, #64`
			`cmp r12, #64`
			`vst1.8 {q0, q1}, [r3]!`
			`vst1.8 {q2, q3}, [r3]!`
			`bhs cp_width_uv_64_loop_1`

			`cmp r12, #0`
			`beq cp_width_uv_done_1`

			`cp_width_uv_8_loop_1`
			`vld1.8 {d0}, [r2]!`
			`sub r12, r12, #8`
			`cmp r12, #8`
			`vst1.8 {d0}, [r3]!`
			`bhs cp_width_uv_8_loop_1`

			`cmp r12, #0`
			`beq cp_width_uv_done_1`

			`cp_width_uv_1_loop_1`
			`ldrb r8, [r2], #1`
			`subs r12, r12, #1`
			`strb r8, [r3], #1`
			`bne cp_width_uv_1_loop_1`
			`cp_width_uv_done_1`

			`subs r9, r9, #1`
			`ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1`
			`ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1`
			`ldrne r10, [r0, #yv12_buffer_config_uv_stride]`
			`ldrne r11, [r1, #yv12_buffer_config_uv_stride]`

			`addne r10, r2, r10 ;second row src`
			`addne r11, r3, r11 ;second row dst`

			`bne cp_uv_loop`

			`vpop {d8 - d15}`
			`pop {r4 - r11, pc}`

			`ENDP`
			`END`