Convert all tabs to spaces in assembly sources, unify indentation

Previously the assembly sources had mixed indentation consisting
of both spaces and tabs, making it quite hard to read unless
the right tab size was used in the editor.

Tabs have been interpreted as 4 spaces in most cases, matching
the surrounding code.
This commit is contained in:
Martin Storsjö
2014-05-31 14:13:34 +03:00
parent faaf62afad
commit 57f6bcc4b0
38 changed files with 19904 additions and 19904 deletions

View File

@@ -36,17 +36,17 @@
// SQR_ADD_16BYTES lo, hi, acc
//   lo, hi : D registers, each holding eight u8 values
//   acc    : Q register of u32 accumulators
// Squares every byte of lo and hi (vmull.u8 of a register with itself gives
// eight u16 products) and pairwise-adds those products into the u32 lanes of
// acc (vpadal.u16). Clobbers q3 and q8.
// Two spellings of the same macro: positional $0..$2 arguments when
// __APPLE__ is defined, named \arg0..\arg2 arguments otherwise.
// NOTE(review): the instruction lines appear twice in a row in this listing;
// that looks like the old and new sides of a whitespace-only diff, not
// intended repetition. The description covers one copy - confirm against the
// real source.
#ifdef __APPLE__
.macro SQR_ADD_16BYTES
vmull.u8 q3, $0, $0
vmull.u8 q8, $1, $1
vpadal.u16 $2, q3
vpadal.u16 $2, q8
vmull.u8 q3, $0, $0
vmull.u8 q8, $1, $1
vpadal.u16 $2, q3
vpadal.u16 $2, q8
.endm
#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
vmull.u8 q3, \arg0, \arg0
vmull.u8 q8, \arg1, \arg1
vpadal.u16 \arg2, q3
vpadal.u16 \arg2, q8
vmull.u8 q3, \arg0, \arg0
vmull.u8 q8, \arg1, \arg1
vpadal.u16 \arg2, q3
vpadal.u16 \arg2, q8
.endm
#endif
@@ -54,66 +54,66 @@
// SampleVariance16x16_neon
// Reads 16 rows of 16 bytes from two buffers (one row before the loop, then
// 15 loop iterations) and accumulates four totals:
//   q13 "sum"     : sum of |a - b| over all bytes            (u16 lanes)
//   q12 "sqr"     : sum of |a - b|^2                         (u32 lanes)
//   q10 "sum_cur" : sum of the bytes read through r2         (u16 lanes)
//   q9  "sqr_cur" : sum of squares of the bytes read via r2  (u32 lanes)
// Inputs as used by the code:
//   r0 / r1 : first buffer pointer / byte stride ("ref" in the comments)
//   r2 / r3 : second buffer pointer / byte stride ("src" in the comments)
//   [sp]    : after r4 is pushed, the word at [sp, #4] is the output pointer
//             (presumably the fifth argument - confirm against the prototype)
// Output: two 16-bit values stored at that pointer:
//   (sqr >> 8) - (sum >> 8)^2  followed by  (sqr_cur >> 8) - (sum_cur >> 8)^2
// NOTE(review): many lines below appear twice in a row; that looks like the
// old and new sides of a whitespace-only diff, not intended repetition. The
// description covers one copy - confirm against the real source.
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
stmdb sp!, {r4}
vld1.8 {q15}, [r0], r1 //load the ref data (16bytes), post-increment by stride r1
vld1.8 {q14}, [r2], r3 //load the src data (16bytes), post-increment by stride r3
vld1.8 {q15}, [r0], r1 //load the ref data (16bytes), post-increment by stride r1
vld1.8 {q14}, [r2], r3 //load the src data (16bytes), post-increment by stride r3
// First row: initialise the four accumulators instead of adding into them.
vabd.u8 q13, q14, q15
vmull.u8 q12, d27, d27
vmull.u8 q11, d26, d26
vaddl.u16 q12, d24, d25
vpadal.u16 q12, q11 //sqr
vabd.u8 q13, q14, q15
vmull.u8 q12, d27, d27
vmull.u8 q11, d26, d26
vaddl.u16 q12, d24, d25
vpadal.u16 q12, q11 //sqr
vaddl.u8 q13, d26, d27 //sum
vaddl.u8 q10, d28, d29 //sum_cur
vaddl.u8 q10, d28, d29 //sum_cur
vmull.u8 q9, d29, d29
vmull.u8 q8, d28, d28
vaddl.u16 q9, d18, d19 //sqr_cur
vpadal.u16 q9, q8
vmull.u8 q9, d29, d29
vmull.u8 q8, d28, d28
vaddl.u16 q9, d18, d19 //sqr_cur
vpadal.u16 q9, q8
// r4 = number of remaining rows
mov r4, #15
mov r4, #15
pixel_var_16x16_loop0:
vld1.8 {q0}, [r0], r1 //load the ref data (16bytes)
vld1.8 {q1}, [r2], r3 //load the src data (16bytes)
vld1.8 {q0}, [r0], r1 //load the ref data (16bytes)
vld1.8 {q1}, [r2], r3 //load the src data (16bytes)
vabd.u8 q2, q0, q1
vabd.u8 q2, q0, q1
//q10 accumulates sum_cur
vpadal.u8 q10, q1
//q10 accumulates sum_cur
vpadal.u8 q10, q1
//q12 accumulates sqr
SQR_ADD_16BYTES d4, d5, q12
//q12 accumulates sqr
SQR_ADD_16BYTES d4, d5, q12
//q13 accumulates sum
vpadal.u8 q13, q2
vpadal.u8 q13, q2
subs r4, #1
subs r4, #1
//q9 accumulates sqr_cur
SQR_ADD_16BYTES d2, d3, q9
//q9 accumulates sqr_cur
SQR_ADD_16BYTES d2, d3, q9
bne pixel_var_16x16_loop0
bne pixel_var_16x16_loop0
// Horizontal reduction: afterwards d0 = {sum, sum_cur}, d1 = {sqr, sqr_cur}
// as 32-bit lanes.
vadd.u16 d0, d26, d27 //sum
vadd.u16 d1, d20, d21 //sum_cur
vpaddl.u16 q0, q0
vadd.u32 d2, d24, d25 //sqr
vadd.u32 d3, d18, d19 //sqr_cur
vpadd.u32 d0, d0, d1
vpadd.u32 d1, d2, d3
vadd.u16 d0, d26, d27 //sum
vadd.u16 d1, d20, d21 //sum_cur
vpaddl.u16 q0, q0
vadd.u32 d2, d24, d25 //sqr
vadd.u32 d3, d18, d19 //sqr_cur
vpadd.u32 d0, d0, d1
vpadd.u32 d1, d2, d3
// r4 = output pointer (first stack word above the saved r4)
ldr r4, [sp, #4]
ldr r4, [sp, #4]
// Divide all four totals by 256, then d0 = (sqr >> 8) - (sum >> 8)^2 per lane.
vshr.u32 q0, q0, #8
vmul.u32 d0, d0
vsub.u32 d0, d1, d0
vshr.u32 q0, q0, #8
vmul.u32 d0, d0
vsub.u32 d0, d1, d0
// Widen so each result sits in its own D register, then store the low
// 16 bits of each as two consecutive halfwords.
vmovl.u32 q0, d0
vst2.16 {d0[0], d1[0]}, [r4]
vst2.16 {d0[0], d1[0]}, [r4]
ldmia sp!, {r4}
ldmia sp!, {r4}
WELS_ASM_FUNC_END

View File

@@ -30,313 +30,313 @@
*
*/
#ifdef HAVE_NEON
#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"
// DyadicBilinearDownsampler_neon
// 2:1 downsample in both directions: every output byte is the rounded average
// of a 2x2 block of input bytes (horizontal pair sum, rounding shift by 1,
// then rounding halving add of the two rows).
// Registers as used by the code:
//   r0 / r1 : output pointer / output stride
//   r2 / r3 : input pointer / input stride
//   [sp, #24], [sp, #28] after the push : src_width, src_height
//   lr : input bytes consumed in the current row pair (steps of 32)
//   r5 : remaining output rows (src_height >> 1)
//   r6 / r8 : start of the current input row pair / current output row
// Each iteration consumes 32 input bytes from each of two rows and stores
// 16 output bytes. The 16 bytes located just past the last output row are
// saved in q15 before the loop and written back afterwards.
// NOTE(review): many lines below appear twice in a row; that looks like the
// old and new sides of a whitespace-only diff, not intended repetition. The
// description covers one copy - confirm against the real source.
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
stmdb sp!, {r4-r8, lr}
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
stmdb sp!, {r4-r8, lr}
//Get the width and height
ldr r4, [sp, #24] //src_width
ldr r5, [sp, #28] //src_height
//Get the width and height
ldr r4, [sp, #24] //src_width
ldr r5, [sp, #28] //src_height
//Initialize the registers
mov r6, r2
mov r8, r0
mov lr, #0
lsr r5, #1
//Initialize the registers
mov r6, r2
mov r8, r0
mov lr, #0
lsr r5, #1
//Save the tail: 16 bytes at dst + dst_stride * (src_height / 2), restored after the loop
//(presumably because the 16-byte stores can overrun when the width is not a multiple of 32 - confirm)
mla r7, r1, r5, r0
vld1.32 {q15}, [r7]
//Save the tail: 16 bytes at dst + dst_stride * (src_height / 2), restored after the loop
//(presumably because the 16-byte stores can overrun when the width is not a multiple of 32 - confirm)
mla r7, r1, r5, r0
vld1.32 {q15}, [r7]
add r7, r2, r3
//processing a column of data
add r7, r2, r3
//processing a column of data
comp_ds_bilinear_loop0:
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
vpaddl.u8 q3, q3
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrshr.u16 q2, #1
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
add lr, #32
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
vpaddl.u8 q3, q3
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrshr.u16 q2, #1
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
add lr, #32
// When lr >= src_width (carry set) the row pair is finished: reset lr, move
// the input down two rows and the output down one row, and count the row.
// When carry is clear the cmp leaves Z clear, so bne keeps looping.
cmp lr, r4
movcs lr, #0
addcs r6, r6, r3, lsl #1
movcs r2, r6
addcs r7, r2, r3
addcs r8, r1
movcs r0, r8
subscs r5, #1
bne comp_ds_bilinear_loop0
cmp lr, r4
movcs lr, #0
addcs r6, r6, r3, lsl #1
movcs r2, r6
addcs r7, r2, r3
addcs r8, r1
movcs r0, r8
subscs r5, #1
bne comp_ds_bilinear_loop0
//restore the tail bytes saved in q15 (r0 now points just past the last output row)
vst1.32 {q15}, [r0]
//restore the tail bytes saved in q15 (r0 now points just past the last output row)
vst1.32 {q15}, [r0]
ldmia sp!, {r4-r8,lr}
ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END
// comp_ds_bilinear_w_x8_neon
// 2:1 downsample in both directions, 8 input bytes per row per inner
// iteration (4 output bytes); each output byte is the rounded average of a
// 2x2 input block.
// Registers as used by the code:
//   r0 / r1 : output pointer / output stride (r1 becomes stride - width/2)
//   r2 / r3 : input pointer / input stride
//   [sp, #20], [sp, #24] after the push : src_width, src_height
//   lr : input stride - src_width, r5 : output rows left (src_height >> 1)
//   r6 : inner-loop count (src_width >> 3), r7 : pointer to the second row
// NOTE(review): many lines below appear twice in a row; that looks like the
// old and new sides of a whitespace-only diff, not intended repetition. The
// description covers one copy - confirm against the real source.
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
stmdb sp!, {r4-r7, lr}
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
stmdb sp!, {r4-r7, lr}
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the difference (bytes to skip at the end of each input / output row)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
//Get the difference (bytes to skip at the end of each input / output row)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
lsr r5, #1
//processing a column of data
//processing a column of data
comp_ds_bilinear_w_x8_loop0:
lsr r6, r4, #3
add r7, r2, r3
//processing a line of data
lsr r6, r4, #3
add r7, r2, r3
//processing a line of data
comp_ds_bilinear_w_x8_loop1:
vld1.8 {d0}, [r2]!
vld1.8 {d1}, [r7]!
vpaddl.u8 q0, q0
vrshr.u16 q0, #1
vrhadd.u16 d0, d1
vld1.8 {d0}, [r2]!
vld1.8 {d1}, [r7]!
vpaddl.u8 q0, q0
vrshr.u16 q0, #1
vrhadd.u16 d0, d1
// Narrow to bytes; only the low four bytes of d0 are results and get stored.
vmovn.u16 d0, q0
vst1.32 {d0[0]}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x8_loop1
vmovn.u16 d0, q0
vst1.32 {d0[0]}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x8_loop1
// r7 has walked to the end of the second row; adding lr gives the start of
// the next row pair.
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x8_loop0
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x8_loop0
ldmia sp!, {r4-r7,lr}
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
// comp_ds_bilinear_w_x16_neon
// 2:1 downsample in both directions, 16 input bytes per row per inner
// iteration (8 output bytes); each output byte is the rounded average of a
// 2x2 input block.
// Registers as used by the code:
//   r0 / r1 : output pointer / output stride (r1 becomes stride - width/2)
//   r2 / r3 : input pointer / input stride
//   [sp, #20], [sp, #24] after the push : src_width, src_height
//   lr : input stride - src_width, r5 : output rows left (src_height >> 1)
//   r6 : inner-loop count (src_width >> 4), r7 : pointer to the second row
// NOTE(review): many lines below appear twice in a row; that looks like the
// old and new sides of a whitespace-only diff, not intended repetition. The
// description covers one copy - confirm against the real source.
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
stmdb sp!, {r4-r7, lr}
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
stmdb sp!, {r4-r7, lr}
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the difference (bytes to skip at the end of each input / output row)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
//Get the difference (bytes to skip at the end of each input / output row)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
lsr r5, #1
//processing a column of data
//processing a column of data
comp_ds_bilinear_w_x16_loop0:
lsr r6, r4, #4
add r7, r2, r3
//processing a line of data
lsr r6, r4, #4
add r7, r2, r3
//processing a line of data
comp_ds_bilinear_w_x16_loop1:
vld1.8 {q0}, [r2]!
vld1.8 {q1}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrhadd.u16 q0, q1
vld1.8 {q0}, [r2]!
vld1.8 {q1}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrhadd.u16 q0, q1
// Narrow the eight u16 averages to bytes and store them.
vmovn.u16 d0, q0
vst1.32 {d0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x16_loop1
vmovn.u16 d0, q0
vst1.32 {d0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x16_loop1
// r7 has walked to the end of the second row; adding lr gives the start of
// the next row pair.
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x16_loop0
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x16_loop0
ldmia sp!, {r4-r7,lr}
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
// DyadicBilinearDownsamplerWidthx32_neon
// 2:1 downsample in both directions, 32 input bytes per row per inner
// iteration (16 output bytes); each output byte is the rounded average of a
// 2x2 input block.
// Registers as used by the code:
//   r0 / r1 : output pointer / output stride (r1 becomes stride - width/2)
//   r2 / r3 : input pointer / input stride
//   [sp, #20], [sp, #24] after the push : src_width, src_height
//   lr : input stride - src_width, r5 : output rows left (src_height >> 1)
//   r6 : inner-loop count (src_width >> 5), r7 : pointer to the second row
// NOTE(review): many lines below appear twice in a row; that looks like the
// old and new sides of a whitespace-only diff, not intended repetition. The
// description covers one copy - confirm against the real source.
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
stmdb sp!, {r4-r7, lr}
WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
stmdb sp!, {r4-r7, lr}
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the width and height
ldr r4, [sp, #20] //src_width
ldr r5, [sp, #24] //src_height
//Get the difference (bytes to skip at the end of each input / output row)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
//Get the difference (bytes to skip at the end of each input / output row)
sub lr, r3, r4
sub r1, r1, r4, lsr #1
lsr r5, #1
lsr r5, #1
//processing a column of data
//processing a column of data
comp_ds_bilinear_w_x32_loop0:
lsr r6, r4, #5
add r7, r2, r3
//processing a line of data
lsr r6, r4, #5
add r7, r2, r3
//processing a line of data
comp_ds_bilinear_w_x32_loop1:
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
vpaddl.u8 q3, q3
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrshr.u16 q2, #1
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
vld1.8 {q0,q1}, [r2]!
vld1.8 {q2,q3}, [r7]!
vpaddl.u8 q0, q0
vpaddl.u8 q1, q1
vpaddl.u8 q2, q2
vpaddl.u8 q3, q3
vrshr.u16 q0, #1
vrshr.u16 q1, #1
vrshr.u16 q2, #1
vrshr.u16 q3, #1
vrhadd.u16 q0, q2
vrhadd.u16 q1, q3
// Narrow the sixteen u16 averages to bytes (packed into q0) and store them.
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x32_loop1
vmovn.u16 d0, q0
vmovn.u16 d1, q1
vst1.32 {q0}, [r0]!
subs r6, #1
bne comp_ds_bilinear_w_x32_loop1
// r7 has walked to the end of the second row; adding lr gives the start of
// the next row pair.
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x32_loop0
add r2, r7, lr
add r0, r1
subs r5, #1
bne comp_ds_bilinear_w_x32_loop0
ldmia sp!, {r4-r7,lr}
ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END
// GeneralBilinearAccurateDownsampler_neon
// Arbitrary-ratio bilinear downsampler using fixed-point source positions
// (integer sample index = position >> 15, 15-bit fraction).
// Registers as used by the code:
//   r0 : output pointer          r1 : output stride (becomes stride - width)
//   r2 : output width            r3 : output height
//   [sp, #40..#52] after the push : src address, src stride, scaleX, scaleY
//   r8 / r9 : current vertical / horizontal source position (start at 16384)
//   d6 / d7 : horizontal / vertical weight pairs, d0 / d5 : their per-step increments
// Structure:
//   _HEIGHT / _WIDTH : interpolate the first (height - 1) rows, (width - 1)
//                      pixels per row, from a 2x2 source neighbourhood
//   WIDTH_END        : the last pixel of each row is copied from the source
//   LAST_ROW         : the final row is copied sample by sample, no blending
// NOTE(review): the loop counters start at height - 1 and width - 1 and are
// tested with subs/bne, so a width or height of 1 would wrap around -
// presumably callers never pass that; confirm.
// NOTE(review): many lines below appear twice in a row; that looks like the
// old and new sides of a whitespace-only diff, not intended repetition. The
// description covers one copy - confirm against the real source.
WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
stmdb sp!, {r4-r12, lr}
//Get the data from stack
ldr r4, [sp, #40] //the addr of src
ldr r5, [sp, #44] //the value of src_stride
//Get the data from stack
ldr r4, [sp, #40] //the addr of src
ldr r5, [sp, #44] //the value of src_stride
ldr r6, [sp, #48] //the value of scaleX
ldr r7, [sp, #52] //the value of scaleY
// r10 = 32767 (0x7fff): mask that keeps the 15 fractional bits
mov r10, #32768
sub r10, #1
and r8, r6, r10 // r8 uinc (scaleX & 32767, i.e. scaleX mod 32768)
and r8, r6, r10 // r8 uinc (scaleX & 32767, i.e. scaleX mod 32768)
mov r11, #-1
mul r11, r8 // r11 -uinc
mul r11, r8 // r11 -uinc
vdup.s16 d2, r8
vdup.s16 d0, r11
vzip.s16 d0, d2 // uinc -uinc uinc -uinc
and r9, r7, r10 // r9 vinc (scaleY & 32767, i.e. scaleY mod 32768)
and r9, r7, r10 // r9 vinc (scaleY & 32767, i.e. scaleY mod 32768)
mov r11, #-1
mul r11, r9 // r11 -vinc
mul r11, r9 // r11 -vinc
vdup.s16 d2, r9
vdup.s16 d3, r11
vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
vdup.s16 d2, r9
vdup.s16 d3, r11
vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc
// r11 = 0x40003fff: halfwords 16384 (high) and 16383 (low)
mov r11, #0x40000000
mov r11, #0x40000000
mov r12, #0x4000
sub r12, #1
add r11, r12
vdup.s32 d1, r11; //init u 16384 16383 16384 16383
vdup.s32 d1, r11; //init u 16384 16383 16384 16383
mov r11, #16384
mov r11, #16384
vdup.s16 d16, r11
sub r11, #1
vdup.s16 d17, r11
vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383
vdup.s16 d17, r11
vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383
veor q14, q14
sub r1, r2 // stride - width
mov r8, #16384 // yInverse
sub r3, #1
veor q14, q14
sub r1, r2 // stride - width
mov r8, #16384 // yInverse
sub r3, #1
_HEIGHT:
ldr r4, [sp, #40] //the addr of src
// r11 = address of source row (yInverse >> 15), r12 = the row below it
mov r11, r8
lsr r11, #15
mul r11, r5
add r11, r4 // get current row address
mov r12, r11
add r12, r5
mov r11, r8
lsr r11, #15
mul r11, r5
add r11, r4 // get current row address
mov r12, r11
add r12, r5
mov r9, #16384 // xInverse
sub r10, r2, #1
mov r9, #16384 // xInverse
sub r10, r2, #1
vmov.s16 d6, d1
_WIDTH:
// lr = integer source column (xInverse >> 15)
mov lr, r9
lsr lr, #15
mov lr, r9
lsr lr, #15
// Gather the 2x2 neighbourhood: a,b from the upper row, c,d from the lower row
add r4, r11,lr
vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
add r4, r12,lr
vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
vzip.32 d28, d29 //q14: 000d000c000b000a;
vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
vzip.32 d28, d29 //q14: 000d000c000b000a;
// Four 32-bit weights (u * v), each multiplied by its pixel and summed;
// the 64-bit total is rounded and shifted right by 30.
vmull.u16 q13, d6, d7 //q13: init u * init v
vmull.u32 q12, d26,d28
vmlal.u32 q12, d27,d29
vqadd.u64 d24, d24,d25
vrshr.u64 d24, #30
vmull.u16 q13, d6, d7 //q13: init u * init v
vmull.u32 q12, d26,d28
vmlal.u32 q12, d27,d29
vqadd.u64 d24, d24,d25
vrshr.u64 d24, #30
vst1.8 {d24[0]}, [r0]!
add r9, r6
vadd.u16 d6, d0 // inc u
// shift left then right by 1 clears bit 15, keeping each weight in 15 bits
vshl.u16 d6, #1
vshr.u16 d6, #1
subs r10, #1
bne _WIDTH
vst1.8 {d24[0]}, [r0]!
add r9, r6
vadd.u16 d6, d0 // inc u
// shift left then right by 1 clears bit 15, keeping each weight in 15 bits
vshl.u16 d6, #1
vshr.u16 d6, #1
subs r10, #1
bne _WIDTH
WIDTH_END:
// Last pixel of the row: copy the source sample at column (xInverse >> 15)
lsr r9, #15
lsr r9, #15
add r4,r11,r9
vld1.8 {d24[0]}, [r4]
vst1.8 {d24[0]}, [r0]
add r0, #1
add r8, r7
add r0, r1
vadd.s16 d7, d5 // inc v
vshl.u16 d7, #1
vshr.u16 d7, #1
subs r3, #1
bne _HEIGHT
vld1.8 {d24[0]}, [r4]
vst1.8 {d24[0]}, [r0]
add r0, #1
add r8, r7
add r0, r1
vadd.s16 d7, d5 // inc v
vshl.u16 d7, #1
vshr.u16 d7, #1
subs r3, #1
bne _HEIGHT
LAST_ROW:
// Final output row: copy source samples from row (yInverse >> 15)
ldr r4, [sp, #40] //the addr of src
lsr r8, #15
mul r8, r5
add r4, r8 // get current row address
mov r9, #16384
lsr r8, #15
mul r8, r5
add r4, r8 // get current row address
mov r9, #16384
_LAST_ROW_WIDTH:
mov r11, r9
lsr r11, #15
mov r11, r9
lsr r11, #15
add r3, r4,r11
vld1.8 {d0[0]}, [r3]
vst1.8 {d0[0]}, [r0]
add r0, #1
add r9, r6
subs r2, #1
bne _LAST_ROW_WIDTH
add r3, r4,r11
vld1.8 {d0[0]}, [r3]
vst1.8 {d0[0]}, [r0]
add r0, #1
add r9, r6
subs r2, #1
bne _LAST_ROW_WIDTH
ldmia sp!, {r4-r12, lr}
ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END
#endif

View File

@@ -37,32 +37,32 @@
// WelsProcessingSampleSad8x8_neon
// Sum of absolute differences over an 8x8 block: one row handled before the
// loop (vabdl initialises q1) and seven more inside it (vabal accumulates).
// Registers as used by the code:
//   r0 / r1 : first buffer pointer / byte stride
//   r2 / r3 : second buffer pointer / byte stride
//   lr      : remaining-row counter
// Result: the total is moved into r0.
// NOTE(review): many lines below appear twice in a row; that looks like the
// old and new sides of a whitespace-only diff, not intended repetition. The
// description covers one copy - confirm against the real source.
WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
stmdb sp!, {lr}
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
//Do the SAD for 8 bytes
vabdl.u8 q1, d0, d1
//Do the SAD for 8 bytes
vabdl.u8 q1, d0, d1
mov lr, #7
mov lr, #7
pixel_sad_8x8_loop0:
//Loading a horizontal line data (8 bytes)
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
vld1.8 {d0}, [r0], r1
vld1.8 {d1}, [r2], r3
subs lr, #1
subs lr, #1
//Do the SAD for 8 bytes (vabal does not alter the flags set by subs)
vabal.u8 q1, d0, d1
bne pixel_sad_8x8_loop0
//Do the SAD for 8 bytes (vabal does not alter the flags set by subs)
vabal.u8 q1, d0, d1
bne pixel_sad_8x8_loop0
// Reduce the eight u16 partial sums in q1 to a single value in d2[0]
vadd.u16 d2, d3
vpaddl.u16 d2, d2
vpaddl.u32 d2, d2
vmov.u32 r0, d2[0]//TBO...
vadd.u16 d2, d3
vpaddl.u16 d2, d2
vpaddl.u32 d2, d2
vmov.u32 r0, d2[0]//TBO...
ldmia sp!, {lr}
ldmia sp!, {lr}
WELS_ASM_FUNC_END
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -56,217 +56,217 @@ sse2_20 times 8 dw 20
;***********************************************************************
SECTION .text
; WEIGHT_LINE tmp1, tmp2, limit, totWeight, sum, center, zero, tmp3, mem
;   %1, %2, %8 : scratch XMM registers (overwritten)
;   %3 : per-word constant the absolute difference is subtracted from
;   %4 : running total of weights (saturating word add)
;   %5 : running weighted pixel sum (saturating word add)
;   %6 : centre pixels, one per word
;   %7 : all-zero register used to widen bytes to words
;   %9 : memory operand supplying 8 neighbour pixels
; Per pixel: diff = |neighbour - centre|, w = ((%3 - diff, saturated at 0)^2) >> 5,
; then %4 += w and %5 += neighbour * w.
; NOTE(review): many lines below (including the %macro line) appear twice in a
; row; that looks like the old and new sides of a whitespace-only diff, not
; intended repetition. The description covers one copy - confirm against the
; real source.
%macro WEIGHT_LINE 9
movq %2, %9
punpcklbw %2, %7
movdqa %8, %2
%macro WEIGHT_LINE 9
movq %2, %9
punpcklbw %2, %7
movdqa %8, %2
movdqa %1, %6
psubusb %1, %8
psubusb %8, %6
por %8, %1 ; ABS(curPixel - centerPixel);
movdqa %1, %6
psubusb %1, %8
psubusb %8, %6
por %8, %1 ; ABS(curPixel - centerPixel);
movdqa %1, %3
psubusb %1, %8
movdqa %1, %3
psubusb %1, %8
pmullw %1, %1
psrlw %1, 5
pmullw %2, %1
paddusw %4, %1
paddusw %5, %2
pmullw %1, %1
psrlw %1, 5
pmullw %2, %1
paddusw %4, %1
paddusw %5, %2
%endmacro
; WEIGHT_LINE1_UV row, tmp, acc, zero
;   %1 : XMM register holding the source row bytes (not modified)
;   %2 : scratch XMM register
;   %3 : word accumulator
;   %4 : all-zero register used to widen bytes to words
; Adds the row to %3 at byte offsets 0..4 with weights 1, 1, 2, 1, 1
; (psrldq selects the offset, psllw by 1 applies the weight 2).
; NOTE(review): many lines below (including the %macro line) appear twice in a
; row; that looks like the old and new sides of a whitespace-only diff, not
; intended repetition. The description covers one copy - confirm against the
; real source.
%macro WEIGHT_LINE1_UV 4
movdqa %2, %1
punpcklbw %2, %4
paddw %3, %2
%macro WEIGHT_LINE1_UV 4
movdqa %2, %1
punpcklbw %2, %4
paddw %3, %2
movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
paddw %3, %2
movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
paddw %3, %2
movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
paddw %3, %2
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
paddw %3, %2
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
paddw %3, %2
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
paddw %3, %2
%endmacro
; WEIGHT_LINE2_UV row, tmp, acc, zero
;   %1 : XMM register holding the source row bytes (not modified)
;   %2 : scratch XMM register
;   %3 : word accumulator
;   %4 : all-zero register used to widen bytes to words
; Adds the row to %3 at byte offsets 0..4 with weights 1, 2, 4, 2, 1
; (psrldq selects the offset, psllw by 1 or 2 applies the weight 2 or 4).
; NOTE(review): many lines below (including the %macro line) appear twice in a
; row; that looks like the old and new sides of a whitespace-only diff, not
; intended repetition. The description covers one copy - confirm against the
; real source.
%macro WEIGHT_LINE2_UV 4
movdqa %2, %1
punpcklbw %2, %4
paddw %3, %2
%macro WEIGHT_LINE2_UV 4
movdqa %2, %1
punpcklbw %2, %4
paddw %3, %2
movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2
movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
paddw %3, %2
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
paddw %3, %2
%endmacro
; WEIGHT_LINE3_UV row, tmp, acc, zero
;   %1 : XMM register holding the source row bytes (not modified)
;   %2 : scratch XMM register
;   %3 : word accumulator
;   %4 : all-zero register used to widen bytes to words
; Adds the row to %3 at byte offsets 0..4 with weights 2, 4, 20, 4, 2
; (shifts apply the weights 2 and 4; the centre uses pmullw with [sse2_20]).
; NOTE(review): many lines below (including the %macro line) appear twice in a
; row; that looks like the old and new sides of a whitespace-only diff, not
; intended repetition. The description covers one copy - confirm against the
; real source.
%macro WEIGHT_LINE3_UV 4
movdqa %2, %1
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
%macro WEIGHT_LINE3_UV 4
movdqa %2, %1
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2
movdqa %2, %1
psrldq %2, 1
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2
movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
pmullw %2, [sse2_20]
paddw %3, %2
movdqa %2, %1
psrldq %2, 2
punpcklbw %2, %4
pmullw %2, [sse2_20]
paddw %3, %2
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2
movdqa %2, %1
psrldq %2, 3
punpcklbw %2, %4
psllw %2, 2
paddw %3, %2
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
movdqa %2, %1
psrldq %2, 4
punpcklbw %2, %4
psllw %2, 1
paddw %3, %2
%endmacro
;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
; 1 2 3
; 4 0 5
; 6 7 8
; 0: the center point
; 1 2 3
; 4 0 5
; 6 7 8
; 0: the center point
;
; Filters 8 horizontally adjacent pixels in place. For each of them the eight
; neighbours (numbered as in the diagram) are fed through WEIGHT_LINE, which
; accumulates a weight total (xmm4) and a weighted sum (xmm5). The centre
; pixel then gets the remaining weight (256 - total, saturated at 0) and the
; result is (sum + centre * remaining) >> 8, packed to bytes and stored back
; at the original pointer kept in r3.
; r0 / r1 are presumably pixels / stride as set up by LOAD_2_PARA - confirm
; against that macro's definition.
; NOTE(review): many lines below appear twice in a row; that looks like the
; old and new sides of a whitespace-only diff, not intended repetition. The
; description covers one copy - confirm against the real source.
WELS_EXTERN BilateralLumaFilter8_sse2
push r3
%assign push_num 1
LOAD_2_PARA
PUSH_XMM 8
push r3
%assign push_num 1
LOAD_2_PARA
PUSH_XMM 8
pxor xmm7, xmm7
pxor xmm7, xmm7
; r3 keeps the original pixel pointer for the final store
mov r3, r0
mov r3, r0
; xmm6 = centre pixels widened to words
movq xmm6, [r0]
punpcklbw xmm6, xmm7
movdqa xmm3, [sse2_32]
pxor xmm4, xmm4 ; nTotWeight
pxor xmm5, xmm5 ; nSum
movq xmm6, [r0]
punpcklbw xmm6, xmm7
movdqa xmm3, [sse2_32]
pxor xmm4, xmm4 ; nTotWeight
pxor xmm5, xmm5 ; nSum
; same row, one byte to the left and one to the right of the centre
dec r0
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
dec r0
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 4
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5
; row above
sub r0, r1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
sub r0, r1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 1
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3
; row below (two strides down from the row above)
lea r0, [r0 + r1 * 2]
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
lea r0, [r0 + r1 * 2]
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0] ; pixel 6
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8
; xmm0 = 256 in every word (all-ones >> 15 = 1, then << 8); subtracting
; nTotWeight leaves the centre pixel's weight
pcmpeqw xmm0, xmm0
psrlw xmm0, 15
psllw xmm0, 8
psubusw xmm0, xmm4
pmullw xmm0, xmm6
paddusw xmm5, xmm0
psrlw xmm5, 8
packuswb xmm5, xmm5
movq [r3], xmm5
pcmpeqw xmm0, xmm0
psrlw xmm0, 15
psllw xmm0, 8
psubusw xmm0, xmm4
pmullw xmm0, xmm6
paddusw xmm5, xmm0
psrlw xmm5, 8
packuswb xmm5, xmm5
movq [r3], xmm5
POP_XMM
pop r3
%assign push_num 0
POP_XMM
pop r3
%assign push_num 0
ret
ret
;***********************************************************************
; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
;5x5 filter:
;1 1 2 1 1
;1 2 4 2 1
;2 4 20 4 2
;1 2 4 2 1
;1 1 2 1 1
;1 1 2 1 1
;1 2 4 2 1
;2 4 20 4 2
;1 2 4 2 1
;1 1 2 1 1
;
; Applies the 5x5 kernel above to 8 horizontally adjacent pixels in place.
; Five rows starting at pixels - 2 * stride - 2 are loaded (16 bytes each,
; unaligned) and accumulated as words in xmm3 by the WEIGHT_LINEn_UV macros;
; the total is shifted right by 6 (the kernel weights add up to 64), packed
; to bytes and stored at the original pixel address.
; r0 / r1 are presumably pixels / stride as set up by LOAD_2_PARA - confirm
; against that macro's definition.
; NOTE(review): many lines below appear twice in a row; that looks like the
; old and new sides of a whitespace-only diff, not intended repetition. The
; description covers one copy - confirm against the real source.
WELS_EXTERN WaverageChromaFilter8_sse2
push r3
push r3
%assign push_num 1
%assign push_num 1
LOAD_2_PARA
LOAD_2_PARA
; r3 = 2 * stride
mov r3, r1
add r3, r3
sub r0, r3 ; pixels - 2 * stride
sub r0, 2
mov r3, r1
add r3, r3
sub r0, r3 ; pixels - 2 * stride
sub r0, 2
; xmm0 = zero (for widening), xmm3 = accumulator
pxor xmm0, xmm0
pxor xmm3, xmm3
pxor xmm0, xmm0
pxor xmm3, xmm3
; kernel row 1: 1 1 2 1 1
movdqu xmm1, [r0]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
movdqu xmm1, [r0]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
; kernel row 2: 1 2 4 2 1
movdqu xmm1, [r0 + r1]
WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
movdqu xmm1, [r0 + r1]
WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
; kernel row 3 (centre row): 2 4 20 4 2; r0 now equals pixels - 2
add r0, r3
movdqu xmm1, [r0]
WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
add r0, r3
movdqu xmm1, [r0]
WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0
; kernel row 4: 1 2 4 2 1
movdqu xmm1, [r0 + r1]
WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
movdqu xmm1, [r0 + r1]
WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0
; kernel row 5: 1 1 2 1 1
movdqu xmm1, [r0 + r1 * 2]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
movdqu xmm1, [r0 + r1 * 2]
WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0
; divide by 64 and store 8 result bytes at the original pixel address
psrlw xmm3, 6
packuswb xmm3, xmm3
movq [r0 + 2], xmm3
psrlw xmm3, 6
packuswb xmm3, xmm3
movq [r0 + 2], xmm3
pop r3
pop r3
%assign push_num 0
ret
%assign push_num 0
ret

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff