Convert all tabs to spaces in assembly sources, unify indentation
Previously the assembly sources had mixed indentation consisting of both spaces and tabs, making it quite hard to read unless the right tab size was used in the editor. Tabs have been interpreted as 4 spaces in most cases, matching the surrounding code.
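
For reference, this kind of conversion can be reproduced with a small script. The sketch below is a hypothetical helper (not necessarily the tool used for this commit) that rewrites the given source files in place, expanding tabs at 4-column tab stops via Python's str.expandtabs:

    # expand_tabs.py -- minimal sketch, assuming tabs should become 4-column stops
    import sys

    TAB_SIZE = 4  # tabs interpreted as 4 spaces, matching the surrounding code

    def convert(path: str) -> None:
        with open(path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        # str.expandtabs honours tab stops, so lines with mixed space/tab
        # indentation keep the alignment they had in a 4-column-tab editor.
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(line.expandtabs(TAB_SIZE) for line in lines)

    if __name__ == "__main__":
        for src in sys.argv[1:]:  # e.g. python expand_tabs.py codec/*.S codec/*.asm
            convert(src)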
@@ -36,17 +36,17 @@
#ifdef __APPLE__
.macro SQR_ADD_16BYTES
    vmull.u8 q3, $0, $0
    vmull.u8 q8, $1, $1
    vpadal.u16 $2, q3
    vpadal.u16 $2, q8
.endm
#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
    vmull.u8 q3, \arg0, \arg0
    vmull.u8 q8, \arg1, \arg1
    vpadal.u16 \arg2, q3
    vpadal.u16 \arg2, q8
.endm
#endif

@@ -54,66 +54,66 @@
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
    stmdb sp!, {r4}

    vld1.8 {q15}, [r0], r1 //save the ref data (16bytes)
    vld1.8 {q14}, [r2], r3 //save the src data (16bytes)

    vabd.u8 q13, q14, q15
    vmull.u8 q12, d27, d27
    vmull.u8 q11, d26, d26
    vaddl.u16 q12, d24, d25
    vpadal.u16 q12, q11 //sqr

    vaddl.u8 q13, d26, d27 //sum

    vaddl.u8 q10, d28, d29 //sum_cur

    vmull.u8 q9, d29, d29
    vmull.u8 q8, d28, d28
    vaddl.u16 q9, d18, d19 //sqr_cur
    vpadal.u16 q9, q8

    mov r4, #15
pixel_var_16x16_loop0:

    vld1.8 {q0}, [r0], r1 //save the ref data (16bytes)
    vld1.8 {q1}, [r2], r3 //save the src data (16bytes)

    vabd.u8 q2, q0, q1

    //q10 save sum_cur
    vpadal.u8 q10, q1

    //q12 save sqr
    SQR_ADD_16BYTES d4, d5, q12

    //q13 save sum
    vpadal.u8 q13, q2

    subs r4, #1

    //q9 save sqr_cur
    SQR_ADD_16BYTES d2, d3, q9

    bne pixel_var_16x16_loop0

    vadd.u16 d0, d26, d27 //sum
    vadd.u16 d1, d20, d21 //sum_cur
    vpaddl.u16 q0, q0
    vadd.u32 d2, d24, d25 //sqr
    vadd.u32 d3, d18, d19 //sqr_cur
    vpadd.u32 d0, d0, d1
    vpadd.u32 d1, d2, d3

    ldr r4, [sp, #4]

    vshr.u32 q0, q0, #8
    vmul.u32 d0, d0
    vsub.u32 d0, d1, d0
    vmovl.u32 q0, d0
    vst2.16 {d0[0], d1[0]}, [r4]

    ldmia sp!, {r4}
WELS_ASM_FUNC_END

@@ -30,313 +30,313 @@
*
*/

#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"


WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
    stmdb sp!, {r4-r8, lr}

    //Get the width and height
    ldr r4, [sp, #24] //src_width
    ldr r5, [sp, #28] //src_height

    //Initialize the register
    mov r6, r2
    mov r8, r0
    mov lr, #0
    lsr r5, #1

    //Save the tailer for the unasigned size
    mla r7, r1, r5, r0
    vld1.32 {q15}, [r7]

    add r7, r2, r3
    //processing a colume data
comp_ds_bilinear_loop0:

    vld1.8 {q0,q1}, [r2]!
    vld1.8 {q2,q3}, [r7]!
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vpaddl.u8 q2, q2
    vpaddl.u8 q3, q3
    vrshr.u16 q0, #1
    vrshr.u16 q1, #1
    vrshr.u16 q2, #1
    vrshr.u16 q3, #1
    vrhadd.u16 q0, q2
    vrhadd.u16 q1, q3
    vmovn.u16 d0, q0
    vmovn.u16 d1, q1
    vst1.32 {q0}, [r0]!
    add lr, #32

    cmp lr, r4
    movcs lr, #0
    addcs r6, r6, r3, lsl #1
    movcs r2, r6
    addcs r7, r2, r3
    addcs r8, r1
    movcs r0, r8
    subscs r5, #1
    bne comp_ds_bilinear_loop0

    //restore the tailer for the unasigned size
    vst1.32 {q15}, [r0]

    ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
    stmdb sp!, {r4-r7, lr}

    //Get the width and height
    ldr r4, [sp, #20] //src_width
    ldr r5, [sp, #24] //src_height

    //Get the difference
    sub lr, r3, r4
    sub r1, r1, r4, lsr #1

    lsr r5, #1

    //processing a colume data
comp_ds_bilinear_w_x8_loop0:

    lsr r6, r4, #3
    add r7, r2, r3
    //processing a line data
comp_ds_bilinear_w_x8_loop1:

    vld1.8 {d0}, [r2]!
    vld1.8 {d1}, [r7]!
    vpaddl.u8 q0, q0
    vrshr.u16 q0, #1
    vrhadd.u16 d0, d1

    vmovn.u16 d0, q0
    vst1.32 {d0[0]}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x8_loop1

    add r2, r7, lr
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x8_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
    stmdb sp!, {r4-r7, lr}

    //Get the width and height
    ldr r4, [sp, #20] //src_width
    ldr r5, [sp, #24] //src_height

    //Get the difference
    sub lr, r3, r4
    sub r1, r1, r4, lsr #1

    lsr r5, #1

    //processing a colume data
comp_ds_bilinear_w_x16_loop0:

    lsr r6, r4, #4
    add r7, r2, r3
    //processing a line data
comp_ds_bilinear_w_x16_loop1:

    vld1.8 {q0}, [r2]!
    vld1.8 {q1}, [r7]!
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vrshr.u16 q0, #1
    vrshr.u16 q1, #1
    vrhadd.u16 q0, q1

    vmovn.u16 d0, q0
    vst1.32 {d0}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x16_loop1

    add r2, r7, lr
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x16_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
    stmdb sp!, {r4-r7, lr}

    //Get the width and height
    ldr r4, [sp, #20] //src_width
    ldr r5, [sp, #24] //src_height

    //Get the difference
    sub lr, r3, r4
    sub r1, r1, r4, lsr #1

    lsr r5, #1

    //processing a colume data
comp_ds_bilinear_w_x32_loop0:

    lsr r6, r4, #5
    add r7, r2, r3
    //processing a line data
comp_ds_bilinear_w_x32_loop1:

    vld1.8 {q0,q1}, [r2]!
    vld1.8 {q2,q3}, [r7]!
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vpaddl.u8 q2, q2
    vpaddl.u8 q3, q3
    vrshr.u16 q0, #1
    vrshr.u16 q1, #1
    vrshr.u16 q2, #1
    vrshr.u16 q3, #1
    vrhadd.u16 q0, q2
    vrhadd.u16 q1, q3

    vmovn.u16 d0, q0
    vmovn.u16 d1, q1
    vst1.32 {q0}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x32_loop1

    add r2, r7, lr
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x32_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
    stmdb sp!, {r4-r12, lr}

    //Get the data from stack
    ldr r4, [sp, #40] //the addr of src
    ldr r5, [sp, #44] //the value of src_stride
    ldr r6, [sp, #48] //the value of scaleX
    ldr r7, [sp, #52] //the value of scaleY

    mov r10, #32768
    sub r10, #1
    and r8, r6, r10 // r8 uinc(scaleX mod 32767)
    mov r11, #-1
    mul r11, r8 // r11 -uinc

    vdup.s16 d2, r8
    vdup.s16 d0, r11
    vzip.s16 d0, d2 // uinc -uinc uinc -uinc

    and r9, r7, r10 // r9 vinc(scaleY mod 32767)
    mov r11, #-1
    mul r11, r9 // r11 -vinc

    vdup.s16 d2, r9
    vdup.s16 d3, r11
    vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc

    mov r11, #0x40000000
    mov r12, #0x4000
    sub r12, #1
    add r11, r12
    vdup.s32 d1, r11; //init u 16384 16383 16384 16383

    mov r11, #16384
    vdup.s16 d16, r11
    sub r11, #1
    vdup.s16 d17, r11
    vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383

    veor q14, q14
    sub r1, r2 // stride - width
    mov r8, #16384 // yInverse
    sub r3, #1

_HEIGHT:
    ldr r4, [sp, #40] //the addr of src
    mov r11, r8
    lsr r11, #15
    mul r11, r5
    add r11, r4 // get current row address
    mov r12, r11
    add r12, r5

    mov r9, #16384 // xInverse
    sub r10, r2, #1
    vmov.s16 d6, d1

_WIDTH:
    mov lr, r9
    lsr lr, #15
    add r4, r11,lr
    vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
    add r4, r12,lr
    vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
    vzip.32 d28, d29 //q14: 000d000c000b000a;

    vmull.u16 q13, d6, d7 //q13: init u * init v
    vmull.u32 q12, d26,d28
    vmlal.u32 q12, d27,d29
    vqadd.u64 d24, d24,d25
    vrshr.u64 d24, #30

    vst1.8 {d24[0]}, [r0]!
    add r9, r6
    vadd.u16 d6, d0 // inc u
    vshl.u16 d6, #1
    vshr.u16 d6, #1
    subs r10, #1
    bne _WIDTH

WIDTH_END:
    lsr r9, #15
    add r4,r11,r9
    vld1.8 {d24[0]}, [r4]
    vst1.8 {d24[0]}, [r0]
    add r0, #1
    add r8, r7
    add r0, r1
    vadd.s16 d7, d5 // inc v
    vshl.u16 d7, #1
    vshr.u16 d7, #1
    subs r3, #1
    bne _HEIGHT

LAST_ROW:
    ldr r4, [sp, #40] //the addr of src
    lsr r8, #15
    mul r8, r5
    add r4, r8 // get current row address
    mov r9, #16384

_LAST_ROW_WIDTH:
    mov r11, r9
    lsr r11, #15

    add r3, r4,r11
    vld1.8 {d0[0]}, [r3]
    vst1.8 {d0[0]}, [r0]
    add r0, #1
    add r9, r6
    subs r2, #1
    bne _LAST_ROW_WIDTH

    ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END

#endif

@@ -37,32 +37,32 @@

WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
    stmdb sp!, {lr}
    //Loading a horizontal line data (8 bytes)
    vld1.8 {d0}, [r0], r1
    vld1.8 {d1}, [r2], r3

    //Do the SAD for 8 bytes
    vabdl.u8 q1, d0, d1

    mov lr, #7
pixel_sad_8x8_loop0:

    //Loading a horizontal line data (8 bytes)
    vld1.8 {d0}, [r0], r1
    vld1.8 {d1}, [r2], r3

    subs lr, #1

    //Do the SAD for 8 bytes
    vabal.u8 q1, d0, d1
    bne pixel_sad_8x8_loop0

    vadd.u16 d2, d3
    vpaddl.u16 d2, d2
    vpaddl.u32 d2, d2
    vmov.u32 r0, d2[0]//TBO...

    ldmia sp!, {lr}
WELS_ASM_FUNC_END

#endif

@@ -56,217 +56,217 @@ sse2_20 times 8 dw 20

;***********************************************************************
SECTION .text

%macro WEIGHT_LINE 9
    movq %2, %9
    punpcklbw %2, %7
    movdqa %8, %2

    movdqa %1, %6
    psubusb %1, %8
    psubusb %8, %6
    por %8, %1 ; ABS(curPixel - centerPixel);

    movdqa %1, %3
    psubusb %1, %8

    pmullw %1, %1
    psrlw %1, 5
    pmullw %2, %1
    paddusw %4, %1
    paddusw %5, %2
%endmacro

%macro WEIGHT_LINE1_UV 4
    movdqa %2, %1
    punpcklbw %2, %4
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 1
    punpcklbw %2, %4
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 2
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 3
    punpcklbw %2, %4
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 4
    punpcklbw %2, %4
    paddw %3, %2
%endmacro

%macro WEIGHT_LINE2_UV 4
    movdqa %2, %1
    punpcklbw %2, %4
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 1
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 2
    punpcklbw %2, %4
    psllw %2, 2
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 3
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 4
    punpcklbw %2, %4
    paddw %3, %2
%endmacro

%macro WEIGHT_LINE3_UV 4
    movdqa %2, %1
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 1
    punpcklbw %2, %4
    psllw %2, 2
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 2
    punpcklbw %2, %4
    pmullw %2, [sse2_20]
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 3
    punpcklbw %2, %4
    psllw %2, 2
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 4
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2
%endmacro

;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
; 1 2 3
; 4 0 5
; 6 7 8
; 0: the center point

WELS_EXTERN BilateralLumaFilter8_sse2

    push r3
    %assign push_num 1
    LOAD_2_PARA
    PUSH_XMM 8

    pxor xmm7, xmm7

    mov r3, r0

    movq xmm6, [r0]
    punpcklbw xmm6, xmm7
    movdqa xmm3, [sse2_32]
    pxor xmm4, xmm4 ; nTotWeight
    pxor xmm5, xmm5 ; nSum

    dec r0
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5

    sub r0, r1
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3

    lea r0, [r0 + r1 * 2]
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8

    pcmpeqw xmm0, xmm0
    psrlw xmm0, 15
    psllw xmm0, 8
    psubusw xmm0, xmm4
    pmullw xmm0, xmm6
    paddusw xmm5, xmm0
    psrlw xmm5, 8
    packuswb xmm5, xmm5
    movq [r3], xmm5

    POP_XMM
    pop r3
    %assign push_num 0

    ret

;***********************************************************************
; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
;5x5 filter:
;1 1 2 1 1
;1 2 4 2 1
;2 4 20 4 2
;1 2 4 2 1
;1 1 2 1 1

WELS_EXTERN WaverageChromaFilter8_sse2

    push r3

    %assign push_num 1

    LOAD_2_PARA

    mov r3, r1
    add r3, r3
    sub r0, r3 ; pixels - 2 * stride
    sub r0, 2

    pxor xmm0, xmm0
    pxor xmm3, xmm3

    movdqu xmm1, [r0]
    WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0

    movdqu xmm1, [r0 + r1]
    WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0

    add r0, r3
    movdqu xmm1, [r0]
    WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0

    movdqu xmm1, [r0 + r1]
    WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0

    movdqu xmm1, [r0 + r1 * 2]
    WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0

    psrlw xmm3, 6
    packuswb xmm3, xmm3
    movq [r0 + 2], xmm3

    pop r3

    %assign push_num 0
    ret