Convert all tabs to spaces in assembly sources, unify indentation
Previously the assembly sources used a mix of spaces and tabs for indentation, which made them hard to read unless the editor was set to the right tab size. Tabs have been interpreted as 4 spaces in most cases, matching the surrounding code.
This commit is contained in:
parent
faaf62afad
commit
57f6bcc4b0
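As a rough illustration of the kind of normalization this commit performs, here is a minimal Python sketch that expands tabs to 4-column stops; the fixed tab width of 4 and the .S/.asm file extensions are assumptions for the example, since the commit chose the width case by case to match the surrounding code. That a change like this is whitespace-only can be verified with git diff -w, which ignores whitespace differences.

    # Minimal sketch: expand tabs to 4-column stops in assembly sources.
    # Assumptions: a uniform tab width of 4 and the .S/.asm extensions;
    # the actual commit matched the tab width to the surrounding code.
    import pathlib

    for path in pathlib.Path(".").rglob("*"):
        if path.suffix in {".S", ".asm"}:
            text = path.read_text()
            path.write_text(text.expandtabs(4))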
@@ -36,75 +36,75 @@
#ifdef __APPLE__
.macro LOAD_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
    vld1.64 {$0}, [$4,:128], $5
    vld1.64 {$1}, [$4,:128], $5
    vld1.64 {$2}, [$4,:128], $5
    vld1.64 {$3}, [$4,:128], $5
// }
.endm

.macro STORE_ALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
    vst1.64 {$0}, [$4,:128], $5
    vst1.64 {$1}, [$4,:128], $5
    vst1.64 {$2}, [$4,:128], $5
    vst1.64 {$3}, [$4,:128], $5
// }
.endm

.macro LOAD_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, src*, src_stride
    vld1.64 {$0}, [$4], $5
    vld1.64 {$1}, [$4], $5
    vld1.64 {$2}, [$4], $5
    vld1.64 {$3}, [$4], $5
// }
.endm

.macro STORE_UNALIGNED_DATA_WITH_STRIDE
// { // input: $0~$3, dst*, dst_stride
    vst1.64 {$0}, [$4], $5
    vst1.64 {$1}, [$4], $5
    vst1.64 {$2}, [$4], $5
    vst1.64 {$3}, [$4], $5
// }
.endm
#else
.macro LOAD_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
    vld1.64 {\arg0}, [\arg4,:128], \arg5
    vld1.64 {\arg1}, [\arg4,:128], \arg5
    vld1.64 {\arg2}, [\arg4,:128], \arg5
    vld1.64 {\arg3}, [\arg4,:128], \arg5
// }
.endm

.macro STORE_ALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, dst*, dst_stride
    vst1.64 {\arg0}, [\arg4,:128], \arg5
    vst1.64 {\arg1}, [\arg4,:128], \arg5
    vst1.64 {\arg2}, [\arg4,:128], \arg5
    vst1.64 {\arg3}, [\arg4,:128], \arg5
// }
.endm

.macro LOAD_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, src*, src_stride
    vld1.64 {\arg0}, [\arg4], \arg5
    vld1.64 {\arg1}, [\arg4], \arg5
    vld1.64 {\arg2}, [\arg4], \arg5
    vld1.64 {\arg3}, [\arg4], \arg5
// }
.endm

.macro STORE_UNALIGNED_DATA_WITH_STRIDE arg0, arg1, arg2, arg3, arg4, arg5
// { // input: \arg0~\arg3, dst*, dst_stride
    vst1.64 {\arg0}, [\arg4], \arg5
    vst1.64 {\arg1}, [\arg4], \arg5
    vst1.64 {\arg2}, [\arg4], \arg5
    vst1.64 {\arg3}, [\arg4], \arg5
// }
.endm

#endif
@@ -112,89 +112,89 @@

WELS_ASM_FUNC_BEGIN WelsCopy8x8_neon

    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1

    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1

WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN WelsCopy16x16_neon

    LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3

    STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1

    LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3

    STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1

    LOAD_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3

    STORE_ALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1

    LOAD_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3

    STORE_ALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1

WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN WelsCopy16x16NotAligned_neon

    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1

    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1

    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1

    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1

WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN WelsCopy16x8NotAligned_neon

    LOAD_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE q0, q1, q2, q3, r0, r1

    LOAD_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE q8, q9, q10, q11, r0, r1

WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN WelsCopy8x16_neon

    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1

    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1

    LOAD_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE d0, d1, d2, d3, r0, r1

    LOAD_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r2, r3

    STORE_UNALIGNED_DATA_WITH_STRIDE d4, d5, d6, d7, r0, r1

WELS_ASM_FUNC_END
File diff suppressed because it is too large
@@ -37,119 +37,119 @@

WELS_ASM_FUNC_BEGIN ExpandPictureLuma_neon
    stmdb sp!, {r4-r8}
    //Save the dst
    mov r7, r0
    mov r8, r3

    add r4, r7, r2
    sub r4, #1
    //For the left and right expand
_expand_picture_luma_loop2:
    sub r5, r7, #32
    add r6, r4, #1

    vld1.8 {d0[], d1[]}, [r7], r1
    vld1.8 {d2[], d3[]}, [r4], r1

    vst1.8 {q0}, [r5]!
    vst1.8 {q0}, [r5]
    vst1.8 {q1}, [r6]!
    vst1.8 {q1}, [r6]
    subs r8, #1
    bne _expand_picture_luma_loop2

    //for the top and bottom expand
    add r2, #64
    sub r0, #32
    mla r4, r1, r3, r0
    sub r4, r1
_expand_picture_luma_loop0:
    mov r5, #32
    mls r5, r5, r1, r0
    add r6, r4, r1
    vld1.8 {q0}, [r0]!
    vld1.8 {q1}, [r4]!

    mov r8, #32
_expand_picture_luma_loop1:
    vst1.8 {q0}, [r5], r1
    vst1.8 {q1}, [r6], r1
    subs r8, #1
    bne _expand_picture_luma_loop1

    subs r2, #16
    bne _expand_picture_luma_loop0

    //vldreq.32 d0, [r0]

    ldmia sp!, {r4-r8}
WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN ExpandPictureChroma_neon
    stmdb sp!, {r4-r9}
    //Save the dst
    mov r7, r0
    mov r8, r3

    add r4, r7, r2
    sub r4, #1
    //For the left and right expand
_expand_picture_chroma_loop2:
    sub r5, r7, #16
    add r6, r4, #1

    vld1.8 {d0[], d1[]}, [r7], r1
    vld1.8 {d2[], d3[]}, [r4], r1

    vst1.8 {q0}, [r5]
    vst1.8 {q1}, [r6]
    subs r8, #1
    bne _expand_picture_chroma_loop2

    //for the top and bottom expand
    add r2, #32
    mov r9, r2
    bic r2, #15
    sub r0, #16
    mla r4, r1, r3, r0
    sub r4, r1
_expand_picture_chroma_loop0:
    mov r5, #16
    mls r5, r5, r1, r0
    add r6, r4, r1
    vld1.8 {q0}, [r0]!
    vld1.8 {q1}, [r4]!

    mov r8, #16
_expand_picture_chroma_loop1:
    vst1.8 {q0}, [r5], r1
    vst1.8 {q1}, [r6], r1
    subs r8, #1
    bne _expand_picture_chroma_loop1

    subs r2, #16
    bne _expand_picture_chroma_loop0

    //vldreq.32 d0, [r0]

    and r9, #15
    cmp r9, #8
    bne _expand_picture_chroma_end
    mov r5, #16
    mls r5, r5, r1, r0
    add r6, r4, r1
    vld1.8 {d0}, [r0]!
    vld1.8 {d2}, [r4]!
    mov r8, #16
_expand_picture_chroma_loop3:
    vst1.8 {d0}, [r5], r1
    vst1.8 {d2}, [r6], r1
    subs r8, #1
    bne _expand_picture_chroma_loop3
_expand_picture_chroma_end:

    ldmia sp!, {r4-r9}
WELS_ASM_FUNC_END

#endif
File diff suppressed because it is too large
@@ -53,88 +53,88 @@ _expand_picture_luma_loop2:
    sub x8, x8, #1
    cbnz x8, _expand_picture_luma_loop2
    //for the top and bottom expand
    add x2, x2, #64
    sub x0, x0, #32
    madd x4, x1, x3, x0
    sub x4, x4, x1
_expand_picture_luma_loop0:
    mov x5, #32
    msub x5, x5, x1, x0
    add x6, x4, x1
    ld1 {v0.16b}, [x0], x10
    ld1 {v1.16b}, [x4], x10
    mov x8, #32
_expand_picture_luma_loop1:
    st1 {v0.16b}, [x5], x1
    st1 {v1.16b}, [x6], x1
    sub x8, x8, #1
    cbnz x8, _expand_picture_luma_loop1

    sub x2, x2, #16
    cbnz x2, _expand_picture_luma_loop0
WELS_ASM_ARCH64_FUNC_END

WELS_ASM_ARCH64_FUNC_BEGIN ExpandPictureChroma_AArch64_neon
    //Save the dst
    mov x7, x0
    mov x8, x3
    mov x10, #16
    add x4, x7, x2
    sub x4, x4, #1
    //For the left and right expand
_expand_picture_chroma_loop2:
    sub x5, x7, #16
    add x6, x4, #1

    ld1r {v0.16b}, [x7], x1
    ld1r {v1.16b}, [x4], x1

    st1 {v0.16b}, [x5]
    st1 {v1.16b}, [x6]
    sub x8, x8, #1
    cbnz x8, _expand_picture_chroma_loop2

    //for the top and bottom expand
    add x2, x2, #32
    //
    mov x9, x2
    mov x11, #15
    bic x2, x2, x11
    //
    sub x0, x0, #16
    madd x4, x1, x3, x0
    sub x4, x4, x1
_expand_picture_chroma_loop0:
    mov x5, #16
    msub x5, x5, x1, x0
    add x6, x4, x1
    ld1 {v0.16b}, [x0], x10
    ld1 {v1.16b}, [x4], x10

    mov x8, #16
_expand_picture_chroma_loop1:
    st1 {v0.16b}, [x5], x1
    st1 {v1.16b}, [x6], x1
    sub x8, x8, #1
    cbnz x8, _expand_picture_chroma_loop1

    sub x2, x2, #16
    cbnz x2, _expand_picture_chroma_loop0

    and x9, x9, #15
    sub x9, x9, #8
    cbnz x9, _expand_picture_chroma_end
    mov x5, #16
    msub x5, x5, x1, x0
    add x6, x4, x1
    ld1 {v0.8b}, [x0]
    ld1 {v1.8b}, [x4]

    mov x8, #16
_expand_picture_chroma_loop3:
    st1 {v0.8b}, [x5], x1
    st1 {v1.8b}, [x6], x1
    sub x8, x8, #1
    cbnz x8, _expand_picture_chroma_loop3
_expand_picture_chroma_end:
File diff suppressed because it is too large
@@ -44,15 +44,15 @@
;***********************************************************************

%if 1
    %define MOVDQ movdqa
%else
    %define MOVDQ movdqu
%endif

%if 1
    %define WELSEMMS emms
%else
    %define WELSEMMS
%endif


@@ -220,7 +220,7 @@ BITS 32

%macro LOAD_1_PARA 0
%ifdef X86_32
    mov r0, [esp + push_num*4 + 4]
%endif
%endmacro

@@ -234,8 +234,8 @@ BITS 32
%macro LOAD_3_PARA 0
%ifdef X86_32
    mov r0, [esp + push_num*4 + 4]
    mov r1, [esp + push_num*4 + 8]
    mov r2, [esp + push_num*4 + 12]
%endif
%endmacro

@@ -267,7 +267,7 @@ BITS 32

%macro LOAD_6_PARA 0
%ifdef X86_32
    push r3
    push r4
    push r5
    %assign push_num push_num+3

@@ -310,22 +310,22 @@ BITS 32

%macro LOAD_4_PARA_POP 0
%ifdef X86_32
    pop r3
%endif
%endmacro

%macro LOAD_5_PARA_POP 0
%ifdef X86_32
    pop r4
    pop r3
%endif
%endmacro

%macro LOAD_6_PARA_POP 0
%ifdef X86_32
    pop r5
    pop r4
    pop r3
%endif
%endmacro

@@ -416,13 +416,13 @@ BITS 32

%macro SIGN_EXTENSION 2
%ifndef X86_32
    movsxd %1, %2
%endif
%endmacro

%macro SIGN_EXTENSIONW 2
%ifndef X86_32
    movsx %1, %2
%endif
%endmacro

@@ -438,13 +438,13 @@ BITS 32
%endmacro

%macro WELS_AbsW 2
    pxor %2, %2
    psubw %2, %1
    pmaxsw %1, %2
%endmacro

%macro MMX_XSwap 4
    movq %4, %2
    punpckh%1 %4, %3
    punpckl%1 %2, %3
%endmacro

@@ -485,35 +485,35 @@ BITS 32
;in: m1, m2, m3, m4, m5, m6, m7, m8
;pOut: m5, m3, m4, m8, m6, m2, m7, m1
%macro SSE2_TransTwo8x8B 9
    movdqa %9, %8
    SSE2_XSawp bw, %1, %2, %8
    SSE2_XSawp bw, %3, %4, %2
    SSE2_XSawp bw, %5, %6, %4
    movdqa %6, %9
    movdqa %9, %4
    SSE2_XSawp bw, %7, %6, %4

    SSE2_XSawp wd, %1, %3, %6
    SSE2_XSawp wd, %8, %2, %3
    SSE2_XSawp wd, %5, %7, %2
    movdqa %7, %9
    movdqa %9, %3
    SSE2_XSawp wd, %7, %4, %3

    SSE2_XSawp dq, %1, %5, %4
    SSE2_XSawp dq, %6, %2, %5
    SSE2_XSawp dq, %8, %7, %2
    movdqa %7, %9
    movdqa %9, %5
    SSE2_XSawp dq, %7, %3, %5

    SSE2_XSawp qdq, %1, %8, %3
    SSE2_XSawp qdq, %4, %2, %8
    SSE2_XSawp qdq, %6, %7, %2
    movdqa %7, %9
    movdqa %9, %1
    SSE2_XSawp qdq, %7, %5, %1
    movdqa %5, %9
%endmacro

;xmm0, xmm6, xmm7, [eax], [ecx]

@@ -528,32 +528,32 @@ BITS 32

; m2 = m1 + m2, m1 = m1 - m2
%macro SSE2_SumSub 3
    movdqa %3, %2
    paddw %2, %1
    psubw %1, %3
%endmacro


%macro butterfly_1to16_sse 3 ; xmm? for dst, xmm? for tmp, one byte for pSrc [generic register name: a/b/c/d]
    mov %3h, %3l
    movd %1, e%3x ; i.e, 1% = eax (=b0)
    pshuflw %2, %1, 00h ; ..., b0 b0 b0 b0 b0 b0 b0 b0
    pshufd %1, %2, 00h ; b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0, b0 b0 b0 b0
%endmacro

;copy a dw into a xmm for 8 times
%macro SSE2_Copy8Times 2
    movd %1, %2
    punpcklwd %1, %1
    pshufd %1, %1, 0
%endmacro

;copy a db into a xmm for 16 times
%macro SSE2_Copy16Times 2
    movd %1, %2
    pshuflw %1, %1, 0
    punpcklqdq %1, %1
    packuswb %1, %1
%endmacro


@@ -564,35 +564,35 @@ BITS 32

;dw 32,32,32,32,32,32,32,32 for xmm
;dw 32,32,32,32 for mm
%macro WELS_DW32 1
    pcmpeqw %1,%1
    psrlw %1,15
    psllw %1,5
%endmacro

;dw 1, 1, 1, 1, 1, 1, 1, 1 for xmm
;dw 1, 1, 1, 1 for mm
%macro WELS_DW1 1
    pcmpeqw %1,%1
    psrlw %1,15
%endmacro

;all 0 for xmm and mm
%macro WELS_Zero 1
    pxor %1, %1
%endmacro

;dd 1, 1, 1, 1 for xmm
;dd 1, 1 for mm
%macro WELS_DD1 1
    pcmpeqw %1,%1
    psrld %1,31
%endmacro

;dB 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
%macro WELS_DB1 1
    pcmpeqw %1,%1
    psrlw %1,15
    packuswb %1,%1
%endmacro
@@ -29,13 +29,13 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*
;*  cpu_mmx.asm
;*
;*  Abstract
;*      verify cpuid feature support and cpuid detection
;*
;*  History
;*      04/29/2009 Created
;*
;*************************************************************************/

@@ -115,13 +115,13 @@ WELS_EXTERN WelsCPUId
%elifdef X86_32

WELS_EXTERN WelsCPUId
    push ebx
    push edi

    mov eax, [esp+12] ; operating index
    mov edi, [esp+24]
    mov ecx, [edi]
    cpuid ; cpuid

    ; processing various information return
    mov edi, [esp+16]

@@ -133,7 +133,7 @@ WELS_EXTERN WelsCPUId
    mov edi, [esp+28]
    mov [edi], edx

    pop edi
    pop ebx
    ret

@@ -145,31 +145,31 @@
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportAVX
%ifdef WIN64
    mov eax, ecx
    mov ecx, edx
%elifdef UNIX64
    mov eax, edi
    mov ecx, esi
%else
    mov eax, [esp+4]
    mov ecx, [esp+8]
%endif

    ; refer to detection of AVX addressed in INTEL AVX manual document
    and ecx, 018000000H
    cmp ecx, 018000000H ; check both OSXSAVE and AVX feature flags
    jne avx_not_supported
    ; processor supports AVX instructions and XGETBV is enabled by OS
    mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
    XGETBV ; result in EDX:EAX
    and eax, 06H
    cmp eax, 06H ; check OS has enabled both XMM and YMM state support
    jne avx_not_supported
    mov eax, 1
    ret
avx_not_supported:
    mov eax, 0
    ret

@@ -178,35 +178,35 @@ avx_not_supported:
;****************************************************************************************************
WELS_EXTERN WelsCPUSupportFMA
%ifdef WIN64
    mov eax, ecx
    mov ecx, edx
%elifdef UNIX64
    mov eax, edi
    mov ecx, esi
%else
    mov eax, [esp+4]
    mov ecx, [esp+8]
%endif
    ; refer to detection of FMA addressed in INTEL AVX manual document
    and ecx, 018001000H
    cmp ecx, 018001000H ; check OSXSAVE, AVX, FMA feature flags
    jne fma_not_supported
    ; processor supports AVX,FMA instructions and XGETBV is enabled by OS
    mov ecx, 0 ; specify 0 for XFEATURE_ENABLED_MASK register
    XGETBV ; result in EDX:EAX
    and eax, 06H
    cmp eax, 06H ; check OS has enabled both XMM and YMM state support
    jne fma_not_supported
    mov eax, 1
    ret
fma_not_supported:
    mov eax, 0
    ret

;******************************************************************************************
; void WelsEmms()
;******************************************************************************************
WELS_EXTERN WelsEmms
    emms ; empty mmx technology states
    ret
File diff suppressed because it is too large
@@ -77,280 +77,280 @@ SECTION .text
;cccc|ceeeeeeeeeeeeeeeed|dddd
;cccc|ceeeeeeeeeeeeeeeed|dddd

%macro mov_line_8x4_mmx 3 ; dst, stride, mm?
    movq [%1], %3
    movq [%1+%2], %3
    lea %1, [%1+2*%2]
    movq [%1], %3
    movq [%1+%2], %3
    lea %1, [%1+2*%2]
%endmacro

%macro mov_line_end8x4_mmx 3 ; dst, stride, mm?
    movq [%1], %3
    movq [%1+%2], %3
    lea %1, [%1+2*%2]
    movq [%1], %3
    movq [%1+%2], %3
    lea %1, [%1+%2]
%endmacro

%macro mov_line_16x4_sse2 4 ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3 ; top(bottom)_0
    movdq%4 [%1+%2], %3 ; top(bottom)_1
    lea %1, [%1+2*%2]
    movdq%4 [%1], %3 ; top(bottom)_2
    movdq%4 [%1+%2], %3 ; top(bottom)_3
    lea %1, [%1+2*%2]
%endmacro

%macro mov_line_end16x4_sse2 4 ; dst, stride, xmm?, u/a
    movdq%4 [%1], %3 ; top(bottom)_0
    movdq%4 [%1+%2], %3 ; top(bottom)_1
    lea %1, [%1+2*%2]
    movdq%4 [%1], %3 ; top(bottom)_2
    movdq%4 [%1+%2], %3 ; top(bottom)_3
    lea %1, [%1+%2]
%endmacro

%macro mov_line_32x4_sse2 3 ; dst, stride, xmm?
    movdqa [%1], %3 ; top(bottom)_0
    movdqa [%1+16], %3 ; top(bottom)_0
    movdqa [%1+%2], %3 ; top(bottom)_1
    movdqa [%1+%2+16], %3 ; top(bottom)_1
    lea %1, [%1+2*%2]
    movdqa [%1], %3 ; top(bottom)_2
    movdqa [%1+16], %3 ; top(bottom)_2
    movdqa [%1+%2], %3 ; top(bottom)_3
    movdqa [%1+%2+16], %3 ; top(bottom)_3
    lea %1, [%1+2*%2]
%endmacro

%macro mov_line_end32x4_sse2 3 ; dst, stride, xmm?
    movdqa [%1], %3 ; top(bottom)_0
    movdqa [%1+16], %3 ; top(bottom)_0
    movdqa [%1+%2], %3 ; top(bottom)_1
    movdqa [%1+%2+16], %3 ; top(bottom)_1
    lea %1, [%1+2*%2]
    movdqa [%1], %3 ; top(bottom)_2
    movdqa [%1+16], %3 ; top(bottom)_2
    movdqa [%1+%2], %3 ; top(bottom)_3
    movdqa [%1+%2+16], %3 ; top(bottom)_3
    lea %1, [%1+%2]
%endmacro

%macro exp_top_bottom_sse2 1 ; iPaddingSize [luma(32)/chroma(16)]
    ;r2 [width/16(8)]
    ;r0 [pSrc +0], r5 [pSrc -width] r1[-stride], 32(16) ;top
    ;r3 [pSrc +(h-1)*stride], r4 [pSrc + (h+31)*stride],32(16); bottom

%if %1 == 32 ; for luma
    sar r2, 04h ; width / 16(8) pixels
.top_bottom_loops:
    ; top
    movdqa xmm0, [r0] ; first line of picture pData
    mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_end16x4_sse2 r5, r1, xmm0, a

    ; bottom
    movdqa xmm1, [r3] ; last line of picture pData
    mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_end16x4_sse2 r4, r1, xmm1, a

    lea r0, [r0+16] ; top pSrc
    lea r5, [r5+16] ; top dst
    lea r3, [r3+16] ; bottom pSrc
    lea r4, [r4+16] ; bottom dst
    neg r1 ; positive/negative stride need for next loop?

    dec r2
    jnz near .top_bottom_loops
%elif %1 == 16 ; for chroma ??
    mov r6, r2
    sar r2, 04h ; (width / 16) pixels
.top_bottom_loops:
    ; top
    movdqa xmm0, [r0] ; first line of picture pData
    mov_line_16x4_sse2 r5, r1, xmm0, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_16x4_sse2 r5, r1, xmm0, a
    mov_line_end16x4_sse2 r5, r1, xmm0, a

    ; bottom
    movdqa xmm1, [r3] ; last line of picture pData
    mov_line_16x4_sse2 r4, r1, xmm1, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_16x4_sse2 r4, r1, xmm1, a
    mov_line_end16x4_sse2 r4, r1, xmm1, a

    lea r0, [r0+16] ; top pSrc
    lea r5, [r5+16] ; top dst
    lea r3, [r3+16] ; bottom pSrc
    lea r4, [r4+16] ; bottom dst
    neg r1 ; positive/negative stride need for next loop?

    dec r2
    jnz near .top_bottom_loops

    ; for remaining 8 bytes
    and r6, 0fh ; any 8 bytes left?
    test r6, r6
    jz near .to_be_continued ; no left to exit here

    ; top
    movq mm0, [r0] ; remained 8 byte
    mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    mov_line_8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    mov_line_end8x4_mmx r5, r1, mm0 ; dst, stride, mm?
    ; bottom
    movq mm1, [r3]
    mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    mov_line_8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    mov_line_end8x4_mmx r4, r1, mm1 ; dst, stride, mm?
    WELSEMMS

.to_be_continued:
%endif
%endmacro

%macro exp_left_right_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
    ;r6 [height]
    ;r0 [pSrc+0] r5[pSrc-32] r1[stride]
    ;r3 [pSrc+(w-1)] r4[pSrc+w]

%if %1 == 32 ; for luma
.left_right_loops:
    ; left
    movzx r2d, byte [r0] ; pixel pData for left border
    SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
    movdqa [r5], xmm0
    movdqa [r5+16], xmm0

    ; right
    movzx r2d, byte [r3]
    SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
    movdqa [r4], xmm1
    movdqa [r4+16], xmm1

    lea r0, [r0+r1] ; left pSrc
    lea r5, [r5+r1] ; left dst
    lea r3, [r3+r1] ; right pSrc
    lea r4, [r4+r1] ; right dst

    dec r6
    jnz near .left_right_loops
%elif %1 == 16 ; for chroma ??
.left_right_loops:
    ; left
    movzx r2d, byte [r0] ; pixel pData for left border
    SSE2_Copy16Times xmm0, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
    movdqa [r5], xmm0

    ; right
    movzx r2d, byte [r3]
    SSE2_Copy16Times xmm1, r2d ; dst, tmp, pSrc [generic register name: a/b/c/d]
    movdq%2 [r4], xmm1 ; might not be aligned 16 bytes in case chroma planes

    lea r0, [r0+r1] ; left pSrc
    lea r5, [r5+r1] ; left dst
    lea r3, [r3+r1] ; right pSrc
    lea r4, [r4+r1] ; right dst

    dec r6
    jnz near .left_right_loops
%endif
%endmacro

%macro exp_cross_sse2 2 ; iPaddingSize [luma(32)/chroma(16)], u/a
    ; top-left: (x)mm3, top-right: (x)mm4, bottom-left: (x)mm5, bottom-right: (x)mm6
    ; edi: TL, ebp: TR, eax: BL, ebx: BR, ecx, -stride
    ;r3:TL ,r4:TR,r5:BL,r6:BR r1:-stride
%if %1 == 32 ; luma
    ; TL
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?
    mov_line_end32x4_sse2 r3, r1, xmm3 ; dst, stride, xmm?

    ; TR
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?
    mov_line_end32x4_sse2 r4, r1, xmm4 ; dst, stride, xmm?

    ; BL
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?
    mov_line_end32x4_sse2 r5, r1, xmm5 ; dst, stride, xmm?

    ; BR
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
    mov_line_end32x4_sse2 r6, r1, xmm6 ; dst, stride, xmm?
%elif %1 == 16 ; chroma
    ; TL
    mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?
    mov_line_end16x4_sse2 r3, r1, xmm3, a ; dst, stride, xmm?

    ; TR
    mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
    mov_line_16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?
    mov_line_end16x4_sse2 r4, r1, xmm4, %2 ; dst, stride, xmm?

    ; BL
    mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
    mov_line_16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?
    mov_line_end16x4_sse2 r5, r1, xmm5, a ; dst, stride, xmm?

    ; BR
    mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
    mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
    mov_line_16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
    mov_line_end16x4_sse2 r6, r1, xmm6, %2 ; dst, stride, xmm?
%endif
%endmacro

;***********************************************************************----------------
; void ExpandPictureLuma_sse2( uint8_t *pDst,
;                              const int32_t iStride,
;                              const int32_t iWidth,
;                              const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureLuma_sse2

@@ -403,8 +403,8 @@ WELS_EXTERN ExpandPictureLuma_sse2

    exp_top_bottom_sse2 32

    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    pop r2
    pop r1

@@ -416,8 +416,8 @@ WELS_EXTERN ExpandPictureLuma_sse2
    lea r4,[r3+1] ;right border dst

    ;prepare for cross border data: top-rigth with xmm4
    movzx r6d,byte [r3] ;top -rigth
    SSE2_Copy16Times xmm4,r6d

    neg r1 ;r1 = stride

@@ -438,8 +438,8 @@ WELS_EXTERN ExpandPictureLuma_sse2
    pop r1
    pop r0

    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..

    neg r1 ;r1 = -stride

@@ -472,13 +472,13 @@ WELS_EXTERN ExpandPictureLuma_sse2
    %assign push_num 0

    ret

;***********************************************************************----------------
; void ExpandPictureChromaAlign_sse2( uint8_t *pDst,
;                                     const int32_t iStride,
;                                     const int32_t iWidth,
;                                     const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaAlign_sse2

@@ -531,8 +531,8 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2

    exp_top_bottom_sse2 16

    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    pop r2
    pop r1

@@ -557,7 +557,7 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
    push r0
    push r1
    push r2
    push r6
    exp_left_right_sse2 16,a

    pop r6

@@ -565,8 +565,8 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
    pop r1
    pop r0

    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..

    neg r1 ;r1 = -stride

@@ -599,16 +599,16 @@ WELS_EXTERN ExpandPictureChromaAlign_sse2
    %assign push_num 0

    ret

;***********************************************************************----------------
; void ExpandPictureChromaUnalign_sse2( uint8_t *pDst,
;                                       const int32_t iStride,
;                                       const int32_t iWidth,
;                                       const int32_t iHeight );
;***********************************************************************----------------
WELS_EXTERN ExpandPictureChromaUnalign_sse2
    push r4
    push r5
    push r6

@@ -657,8 +657,8 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2

    exp_top_bottom_sse2 16

    ; for both left and right border
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

    pop r2
    pop r1

@@ -683,7 +683,7 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
    push r0
    push r1
    push r2
    push r6
    exp_left_right_sse2 16,u

    pop r6

@@ -691,8 +691,8 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
    pop r1
    pop r0

    ; for cross border [top-left, top-right, bottom-left, bottom-right]
    ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
    ; have done xmm3,..,xmm6 cross pData initialization above, perform pading as below, To be continued..

    neg r1 ;r1 = -stride

@@ -725,4 +725,4 @@ WELS_EXTERN ExpandPictureChromaUnalign_sse2
    %assign push_num 0

    ret
@ -36,9 +36,9 @@
|
||||
;*
|
||||
;* History
|
||||
;* 15/09/2009 Created
|
||||
;* 12/28/2009 Modified with larger throughput
|
||||
;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
|
||||
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
|
||||
;* 12/28/2009 Modified with larger throughput
|
||||
;* 12/29/2011 Tuned WelsCopy16x16NotAligned_sse2, added UpdateMbMv_sse2 WelsCopy16x8NotAligned_sse2,
|
||||
;* WelsCopy16x8_mmx, WelsCopy8x16_mmx etc;
|
||||
;*
|
||||
;*
|
||||
;*********************************************************************************************/
|
||||
@ -56,174 +56,174 @@ SECTION .text

;***********************************************************************
; void WelsCopy16x16_sse2( uint8_t* Dst,
;                          int32_t  iStrideD,
;                          uint8_t* Src,
;                          int32_t  iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x16_sse2

    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8

    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3

    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+2*r3]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+2*r3]
    movdqa xmm7, [r2+r5]
    lea r2, [r2+4*r3]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    lea r0, [r0+4*r1]

    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+2*r3]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+2*r3]
    movdqa xmm7, [r2+r5]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret
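For orientation, the routine above is a plain strided 16x16 byte-block copy done eight aligned 128-bit moves at a time. A minimal C sketch of the equivalent behavior (illustrative only; the `_c` name is made up here, and the SSE2 version additionally requires 16-byte alignment of both pointers):

    #include <stdint.h>

    /* Reference behavior of WelsCopy16x16_sse2: copy a 16x16 block of
     * bytes between two strided buffers. */
    static void WelsCopy16x16_c(uint8_t* pDst, int32_t iStrideD,
                                const uint8_t* pSrc, int32_t iStrideS) {
        for (int y = 0; y < 16; ++y) {
            for (int x = 0; x < 16; ++x)
                pDst[x] = pSrc[x];
            pDst += iStrideD;
            pSrc += iStrideS;
        }
    }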

;***********************************************************************
; void WelsCopy16x16NotAligned_sse2( uint8_t* Dst,
;                                    int32_t  iStrideD,
;                                    uint8_t* Src,
;                                    int32_t  iStrideS )
;***********************************************************************
; dst can be aligned with 16 bytes, but not sure about pSrc, 12/29/2011
WELS_EXTERN WelsCopy16x16NotAligned_sse2
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8

    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3

    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]
    lea r2, [r2+4*r3]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    lea r0, [r0+4*r1]

    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret

; , 12/29/2011
;***********************************************************************
; void WelsCopy16x8NotAligned_sse2( uint8_t* Dst,
;                                   int32_t  iStrideD,
;                                   uint8_t* Src,
;                                   int32_t  iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy16x8NotAligned_sse2
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8

    lea r4, [r1+2*r1]   ;ebx, [eax+2*eax]   ; x3
    lea r5, [r3+2*r3]   ;edx, [ecx+2*ecx]   ; x3

    movdqu xmm0, [r2]
    movdqu xmm1, [r2+r3]
    movdqu xmm2, [r2+2*r3]
    movdqu xmm3, [r2+r5]
    lea r2, [r2+4*r3]
    movdqu xmm4, [r2]
    movdqu xmm5, [r2+r3]
    movdqu xmm6, [r2+2*r3]
    movdqu xmm7, [r2+r5]

    movdqa [r0], xmm0
    movdqa [r0+r1], xmm1
    movdqa [r0+2*r1], xmm2
    movdqa [r0+r4], xmm3
    lea r0, [r0+4*r1]
    movdqa [r0], xmm4
    movdqa [r0+r1], xmm5
    movdqa [r0+2*r1], xmm6
    movdqa [r0+r4], xmm7
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret


;***********************************************************************
@ -233,62 +233,62 @@ WELS_EXTERN WelsCopy16x8NotAligned_sse2
;                       int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x16_mmx
    %assign push_num 0
    LOAD_4_PARA

    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]
    lea r2, [r2+2*r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7
    lea r0, [r0+2*r1]

    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7

    WELSEMMS
    LOAD_4_PARA_POP
    ret

;***********************************************************************
; void WelsCopy8x8_mmx( uint8_t* Dst,
@ -297,48 +297,48 @@ WELS_EXTERN WelsCopy8x16_mmx
;                       int32_t iStrideS )
;***********************************************************************
WELS_EXTERN WelsCopy8x8_mmx
    push r4
    %assign push_num 1
    LOAD_4_PARA
    lea r4, [r3+2*r3]   ;edx, [ebx+2*ebx]

    ; to prefetch next loop
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    ; to prefetch next loop
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    ; to prefetch next loop
    prefetchnta [r2+2*r3]
    prefetchnta [r2+r4]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]
    movq mm7, [r2+r3]

    movq [r0], mm0
    movq [r0+r1], mm1
    lea r0, [r0+2*r1]
    movq [r0], mm2
    movq [r0+r1], mm3
    lea r0, [r0+2*r1]
    movq [r0], mm4
    movq [r0+r1], mm5
    lea r0, [r0+2*r1]
    movq [r0], mm6
    movq [r0+r1], mm7

    WELSEMMS
    LOAD_4_PARA_POP
    pop r4
    ret
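The prefetchnta pairs above hint the next two source rows toward the core (bypassing higher cache levels) before the following movq loads need them. A rough C intrinsics sketch of the same pattern, for one two-row step (illustrative; the function name is made up):

    #include <xmmintrin.h>  /* _mm_prefetch */
    #include <stdint.h>

    /* Prefetch the next two source rows non-temporally, then copy the
     * current two rows of an 8-byte-wide block. */
    static void copy8x2_with_prefetch(uint8_t* pDst, int32_t iDstStride,
                                      const uint8_t* pSrc, int32_t iSrcStride) {
        _mm_prefetch((const char*)(pSrc + 2 * iSrcStride), _MM_HINT_NTA);
        _mm_prefetch((const char*)(pSrc + 3 * iSrcStride), _MM_HINT_NTA);
        for (int y = 0; y < 2; ++y)
            for (int x = 0; x < 8; ++x)
                pDst[y * iDstStride + x] = pSrc[y * iSrcStride + x];
    }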

; (dunhuang@cisco), 12/21/2011
;***********************************************************************
@ -349,13 +349,13 @@ WELS_EXTERN UpdateMbMv_sse2
    %assign push_num 0
    LOAD_2_PARA

    movd xmm0, r1d  ; _mv
    pshufd xmm1, xmm0, $00
    movdqa [r0     ], xmm1
    movdqa [r0+0x10], xmm1
    movdqa [r0+0x20], xmm1
    movdqa [r0+0x30], xmm1
    ret
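UpdateMbMv_sse2 broadcasts one 32-bit motion vector into a 64-byte run (16 copies), one pshufd plus four aligned stores. A C sketch of that behavior (the SMVUnitXY struct name is an assumption here, used only for illustration):

    #include <stdint.h>

    typedef struct { int16_t iMvX, iMvY; } SMVUnitXY;  /* assumed layout */

    /* Fill all 16 4-byte motion-vector slots of a macroblock with sMv. */
    static void UpdateMbMv_c(SMVUnitXY* pMvBuffer, SMVUnitXY sMv) {
        for (int i = 0; i < 16; ++i)   /* 16 slots * 4 bytes = 64 bytes */
            pMvBuffer[i] = sMv;
    }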

;*******************************************************************************
; Macros and other preprocessor constants
@ -381,14 +381,14 @@ WELS_EXTERN PixelAvgWidthEq4_mmx
    %assign push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

ALIGN 4
.height_loop:
    movd mm0, [r4]
    pavgb mm0, [r2]
    movd [r0], mm0

@ -398,8 +398,8 @@ ALIGN 4
    lea r4, [r4+r5]
    jne .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret

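The PixelAvg* family computes the per-byte rounded average of two prediction sources, exactly what pavgb does in hardware: (a + b + 1) >> 1. A C sketch covering all width variants (illustrative; the generic signature is an assumption, the real entry points are fixed-width):

    #include <stdint.h>

    /* Rounded average of two strided byte blocks, pavgb-style. */
    static void PixelAvg_c(uint8_t* pDst, int32_t iDstStride,
                           const uint8_t* pSrcA, int32_t iStrideA,
                           const uint8_t* pSrcB, int32_t iStrideB,
                           int32_t iWidth, int32_t iHeight) {
        for (int y = 0; y < iHeight; ++y) {
            for (int x = 0; x < iWidth; ++x)
                pDst[x] = (uint8_t)((pSrcA[x] + pSrcB[x] + 1) >> 1);
            pDst += iDstStride;
            pSrcA += iStrideA;
            pSrcB += iStrideB;
        }
    }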
@ -413,29 +413,29 @@ WELS_EXTERN PixelAvgWidthEq8_mmx
    %assign push_num 0
    LOAD_7_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d

ALIGN 4
.height_loop:
    movq mm0, [r2]
    pavgb mm0, [r4]
    movq [r0], mm0
    movq mm0, [r2+r3]
    pavgb mm0, [r4+r5]
    movq [r0+r1], mm0

    lea r2, [r2+2*r3]
    lea r4, [r4+2*r5]
    lea r0, [r0+2*r1]

    sub r6, 2
    jnz .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret

@ -450,46 +450,46 @@ WELS_EXTERN PixelAvgWidthEq16_sse2

    %assign push_num 0
    LOAD_7_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d
    SIGN_EXTENSION r6, r6d
ALIGN 4
.height_loop:
    movdqu xmm0, [r2]
    movdqu xmm1, [r4]
    pavgb xmm0, xmm1
    ;pavgb xmm0, [r4]
    movdqu [r0], xmm0

    movdqu xmm0, [r2+r3]
    movdqu xmm1, [r4+r5]
    pavgb xmm0, xmm1
    movdqu [r0+r1], xmm0

    movdqu xmm0, [r2+2*r3]
    movdqu xmm1, [r4+2*r5]
    pavgb xmm0, xmm1
    movdqu [r0+2*r1], xmm0

    lea r2, [r2+2*r3]
    lea r4, [r4+2*r5]
    lea r0, [r0+2*r1]

    movdqu xmm0, [r2+r3]
    movdqu xmm1, [r4+r5]
    pavgb xmm0, xmm1
    movdqu [r0+r1], xmm0

    lea r2, [r2+2*r3]
    lea r4, [r4+2*r5]
    lea r0, [r0+2*r1]

    sub r6, 4
    jne .height_loop

    WELSEMMS
    LOAD_7_PARA_POP
    ret

;*******************************************************************************
@ -497,26 +497,26 @@ ALIGN 4
;                         uint8_t *pDst, int iDstStride, int iHeight )
;*******************************************************************************
WELS_EXTERN McCopyWidthEq4_mmx
    push r5
    %assign push_num 1
    LOAD_5_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

ALIGN 4
.height_loop:
    mov r5d, [r0]
    mov [r2], r5d

    add r0, r1
    add r2, r3
    dec r4
    jnz .height_loop
    WELSEMMS
    LOAD_5_PARA_POP
    pop r5
    ret

;*******************************************************************************
@ -527,21 +527,21 @@ WELS_EXTERN McCopyWidthEq8_mmx
    %assign push_num 0
    LOAD_5_PARA

    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d

ALIGN 4
.height_loop:
    movq mm0, [r0]
    movq [r2], mm0
    add r0, r1
    add r2, r3
    dec r4
    jnz .height_loop

    WELSEMMS
    LOAD_5_PARA_POP
    ret

@ -550,32 +550,32 @@ ALIGN 4
;*******************************************************************************
;read unaligned memory
%macro SSE_READ_UNA 2
    movq   %1, [%2]
    movhps %1, [%2+8]
%endmacro

;write unaligned memory
%macro SSE_WRITE_UNA 2
    movq   [%1], %2
    movhps [%1+8], %2
%endmacro

WELS_EXTERN McCopyWidthEq16_sse2
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
ALIGN 4
.height_loop:
    SSE_READ_UNA  xmm0, r0
    SSE_READ_UNA  xmm1, r0+r1
    SSE_WRITE_UNA r2, xmm0
    SSE_WRITE_UNA r2+r3, xmm1

    sub r4, 2
    lea r0, [r0+r1*2]
    lea r2, [r2+r3*2]
    jnz .height_loop

    LOAD_5_PARA_POP
    ret
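The two macros above compose an unaligned 128-bit access from two 64-bit halves (movq for the low quadword, movhps for the high one). A C intrinsics sketch of the same idea, assuming SSE2 and using the pd-flavored loads/stores for illustration (movsd/movhpd rather than the exact movq/movhps encodings):

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>

    /* SSE_READ_UNA: unaligned 128-bit load built from two 64-bit halves. */
    static inline __m128i sse_read_una(const uint8_t* p) {
        __m128d lo = _mm_load_sd((const double*)p);            /* low 8 bytes  */
        __m128d v  = _mm_loadh_pd(lo, (const double*)(p + 8)); /* high 8 bytes */
        return _mm_castpd_si128(v);
    }

    /* SSE_WRITE_UNA: unaligned 128-bit store, two 64-bit halves. */
    static inline void sse_write_una(uint8_t* p, __m128i v) {
        _mm_storel_pd((double*)p,       _mm_castsi128_pd(v));
        _mm_storeh_pd((double*)(p + 8), _mm_castsi128_pd(v));
    }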
@ -53,10 +53,10 @@ SECTION .rodata align=16

ALIGN 16
h264_d0x20_sse2:
    dw 32,32,32,32,32,32,32,32
ALIGN 16
h264_d0x20_mmx:
    dw 32,32,32,32


;=============================================================================
@ -67,171 +67,171 @@ SECTION .text

;*******************************************************************************
; void McChromaWidthEq4_mmx( const uint8_t *src,
;                            int32_t iSrcStride,
;                            uint8_t *pDst,
;                            int32_t iDstStride,
;                            const uint8_t *pABCD,
;                            int32_t iHeigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq4_mmx
    %assign push_num 0
    LOAD_6_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

    movd mm3, [r4]  ; [eax]
    WELS_Zero mm7
    punpcklbw mm3, mm3
    movq mm4, mm3
    punpcklwd mm3, mm3
    punpckhwd mm4, mm4

    movq mm5, mm3
    punpcklbw mm3, mm7
    punpckhbw mm5, mm7

    movq mm6, mm4
    punpcklbw mm4, mm7
    punpckhbw mm6, mm7

    lea r4, [r0 + r1]   ;lea ebx, [esi + eax]
    movd mm0, [r0]
    movd mm1, [r0+1]
    punpcklbw mm0, mm7
    punpcklbw mm1, mm7
.xloop:

    pmullw mm0, mm3
    pmullw mm1, mm5
    paddw mm0, mm1

    movd mm1, [r4]
    punpcklbw mm1, mm7
    movq mm2, mm1
    pmullw mm1, mm4
    paddw mm0, mm1

    movd mm1, [r4+1]
    punpcklbw mm1, mm7
    movq mm7, mm1
    pmullw mm1, mm6
    paddw mm0, mm1
    movq mm1, mm7

    paddw mm0, [h264_d0x20_mmx]
    psrlw mm0, 6

    WELS_Zero mm7
    packuswb mm0, mm7
    movd [r2], mm0

    movq mm0, mm2

    lea r2, [r2 + r3]
    lea r4, [r4 + r1]

    dec r5
    jnz near .xloop
    WELSEMMS
    LOAD_6_PARA_POP
    ret
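The chroma MC routines implement H.264 bilinear chroma interpolation: pABCD carries four weights A, B, C, D (summing to 64), and each output pixel is the weighted sum of a 2x2 source neighborhood, rounded by the +32 constant loaded above and shifted right by 6. A C sketch under those assumptions (the pABCD element order is assumed here):

    #include <stdint.h>

    /* Bilinear chroma interpolation, width 4; weights A,B,C,D in pABCD. */
    static void McChromaWidthEq4_c(const uint8_t* pSrc, int32_t iSrcStride,
                                   uint8_t* pDst, int32_t iDstStride,
                                   const uint8_t* pABCD, int32_t iHeight) {
        const int a = pABCD[0], b = pABCD[1], c = pABCD[2], d = pABCD[3];
        for (int y = 0; y < iHeight; ++y) {
            for (int x = 0; x < 4; ++x)
                pDst[x] = (uint8_t)((a * pSrc[x] + b * pSrc[x + 1] +
                                     c * pSrc[x + iSrcStride] +
                                     d * pSrc[x + iSrcStride + 1] + 32) >> 6);
            pSrc += iSrcStride;
            pDst += iDstStride;
        }
    }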
;*******************************************************************************
; void McChromaWidthEq8_sse2( const uint8_t *pSrc,
;                             int32_t iSrcStride,
;                             uint8_t *pDst,
;                             int32_t iDstStride,
;                             const uint8_t *pABCD,
;                             int32_t iheigh );
;*******************************************************************************
WELS_EXTERN McChromaWidthEq8_sse2
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

    movd xmm3, [r4]
    WELS_Zero xmm7
    punpcklbw xmm3, xmm3
    punpcklwd xmm3, xmm3

    movdqa xmm4, xmm3
    punpckldq xmm3, xmm3
    punpckhdq xmm4, xmm4
    movdqa xmm5, xmm3
    movdqa xmm6, xmm4

    punpcklbw xmm3, xmm7
    punpckhbw xmm5, xmm7
    punpcklbw xmm4, xmm7
    punpckhbw xmm6, xmm7

    lea r4, [r0 + r1]   ;lea ebx, [esi + eax]
    movq xmm0, [r0]
    movq xmm1, [r0+1]
    punpcklbw xmm0, xmm7
    punpcklbw xmm1, xmm7
.xloop:

    pmullw xmm0, xmm3
    pmullw xmm1, xmm5
    paddw xmm0, xmm1

    movq xmm1, [r4]
    punpcklbw xmm1, xmm7
    movdqa xmm2, xmm1
    pmullw xmm1, xmm4
    paddw xmm0, xmm1

    movq xmm1, [r4+1]
    punpcklbw xmm1, xmm7
    movdqa xmm7, xmm1
    pmullw xmm1, xmm6
    paddw xmm0, xmm1
    movdqa xmm1, xmm7

    paddw xmm0, [h264_d0x20_sse2]
    psrlw xmm0, 6

    WELS_Zero xmm7
    packuswb xmm0, xmm7
    movq [r2], xmm0

    movdqa xmm0, xmm2

    lea r2, [r2 + r3]
    lea r4, [r4 + r1]

    dec r5
    jnz near .xloop

    POP_XMM
    LOAD_6_PARA_POP

    ret

;***********************************************************************
; void McChromaWidthEq8_ssse3( const uint8_t *pSrc,
;                              int32_t iSrcStride,
;                              uint8_t *pDst,
;                              int32_t iDstStride,
;                              const uint8_t *pABCD,
;                              int32_t iHeigh);
;***********************************************************************
WELS_EXTERN McChromaWidthEq8_ssse3
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r5, r5d

    pxor xmm7, xmm7
    movd xmm5, [r4]
@ -243,27 +243,27 @@ WELS_EXTERN McChromaWidthEq8_ssse3

    sub r2, r3  ;sub esi, edi
    sub r2, r3
    movdqa xmm7, [h264_d0x20_sse2]

    movdqu xmm0, [r0]
    movdqa xmm1, xmm0
    psrldq xmm1, 1
    punpcklbw xmm0, xmm1

.hloop_chroma:
    lea r2, [r2+2*r3]

    movdqu xmm2, [r0+r1]
    movdqa xmm3, xmm2
    psrldq xmm3, 1
    punpcklbw xmm2, xmm3
    movdqa xmm4, xmm2

    pmaddubsw xmm0, xmm5
    pmaddubsw xmm2, xmm6
    paddw xmm0, xmm2
    paddw xmm0, xmm7
    psrlw xmm0, 6
    packuswb xmm0, xmm0
    movq [r2], xmm0

@ -278,16 +278,16 @@ WELS_EXTERN McChromaWidthEq8_ssse3
    pmaddubsw xmm2, xmm6
    paddw xmm4, xmm2
    paddw xmm4, xmm7
    psrlw xmm4, 6
    packuswb xmm4, xmm4
    movq [r2+r3], xmm4

    sub r5, 2
    jnz .hloop_chroma

    POP_XMM
    LOAD_6_PARA_POP

    ret
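The SSSE3 variant's trick is in the psrldq/punpcklbw pair: each pixel is interleaved with its right neighbor so one pmaddubsw computes A*p(x) + B*p(x+1) for eight outputs at once (the weight packing into xmm5/xmm6 happens in the prologue between hunks). A C intrinsics sketch of that one step, with the weight layout assumed:

    #include <tmmintrin.h>  /* SSSE3 */
    #include <stdint.h>

    /* One row's horizontal term: interleave p(x) with p(x+1), then
     * pmaddubsw multiplies unsigned pixels by signed packed weights
     * and adds adjacent pairs, yielding 16-bit partial sums. */
    static __m128i chroma_row_term(const uint8_t* pRow, __m128i wAB) {
        __m128i row  = _mm_loadu_si128((const __m128i*)pRow);
        __m128i next = _mm_srli_si128(row, 1);           /* psrldq 1      */
        __m128i pair = _mm_unpacklo_epi8(row, next);     /* p0,p1,p1,p2.. */
        return _mm_maddubs_epi16(pair, wAB);             /* pmaddubsw     */
    }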

File diff suppressed because it is too large
File diff suppressed because it is too large
@ -29,16 +29,16 @@
;* POSSIBILITY OF SUCH DAMAGE.
;*
;*  vaa.asm
;*
;*  Abstract
;*      sse2 for pVaa routines
;*
;*  History
;*      04/14/2010 Created
;*      06/07/2010 Added AnalysisVaaInfoIntra_sse2(ssse3)
;*      06/10/2010 Tune rc_sad_frame_sse2 and got about 40% improvement
;*      08/11/2010 Added abs_difference_mbrow_sse2 & sum_sqrsum_mbrow_sse2
;*
;*************************************************************************/
%include "asm_inc.asm"
@ -49,87 +49,87 @@
;***********************************************************************

; by comparison this outperforms the phaddw (SSSE3) approach
%macro SUM_WORD_8x2_SSE2 2  ; dst(pSrc), tmp
    ; @sum_8x2 begin
    pshufd %2, %1, 04Eh     ; 01001110 B
    paddw %1, %2
    pshuflw %2, %1, 04Eh    ; 01001110 B
    paddw %1, %2
    pshuflw %2, %1, 0B1h    ; 10110001 B
    paddw %1, %2
    ; end of @sum_8x2
%endmacro ; END of SUM_WORD_8x2_SSE2

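The macro folds eight 16-bit lanes into a total via three shuffle+add rounds, each halving the distance between partial sums. The same reduction as a C intrinsics sketch (illustrative; the sum is taken modulo 2^16, as in the asm):

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>

    /* Horizontal sum of eight 16-bit words: after three rounds every
     * lane holds the total; read it from lane 0. */
    static uint16_t sum_word_8_sse2(__m128i v) {
        __m128i t = _mm_shuffle_epi32(v, 0x4E);   /* swap 64-bit halves */
        v = _mm_add_epi16(v, t);
        t = _mm_shufflelo_epi16(v, 0x4E);         /* swap word pairs    */
        v = _mm_add_epi16(v, t);
        t = _mm_shufflelo_epi16(v, 0xB1);         /* swap within pairs  */
        v = _mm_add_epi16(v, t);
        return (uint16_t)_mm_cvtsi128_si32(v);
    }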
%macro VAA_AVG_BLOCK_SSE2 6 ; dst, t0, t1, t2, t3, t4
    movdqa %1, [r0]     ; line 0
    movdqa %2, [r0+r1]  ; line 1
    movdqa %3, %1
    punpcklbw %1, xmm7
    punpckhbw %3, xmm7
    movdqa %4, %2
    punpcklbw %4, xmm7
    punpckhbw %2, xmm7
    paddw %1, %4
    paddw %2, %3
    movdqa %3, [r0+r2]  ; line 2
    movdqa %4, [r0+r3]  ; line 3
    movdqa %5, %3
    punpcklbw %3, xmm7
    punpckhbw %5, xmm7
    movdqa %6, %4
    punpcklbw %6, xmm7
    punpckhbw %4, xmm7
    paddw %3, %6
    paddw %4, %5
    paddw %1, %3    ; block 0, 1
    paddw %2, %4    ; block 2, 3
    pshufd %3, %1, 0B1h
    pshufd %4, %2, 0B1h
    paddw %1, %3
    paddw %2, %4
    movdqa %3, %1
    movdqa %4, %2
    pshuflw %5, %1, 0B1h
    pshufhw %6, %3, 0B1h
    paddw %1, %5
    paddw %3, %6
    pshuflw %5, %2, 0B1h
    pshufhw %6, %4, 0B1h
    paddw %2, %5
    paddw %4, %6
    punpcklwd %1, %2
    punpckhwd %3, %4
    punpcklwd %1, %3
    psraw %1, $04
%endmacro

%macro VAA_AVG_BLOCK_SSSE3 6 ; dst, t0, t1, t2, t3, t4
    movdqa %1, [r0]     ; line 0
    movdqa %2, [r0+r1]  ; line 1
    movdqa %3, %1
    punpcklbw %1, xmm7
    punpckhbw %3, xmm7
    movdqa %4, %2
    punpcklbw %4, xmm7
    punpckhbw %2, xmm7
    paddw %1, %4
    paddw %2, %3
    movdqa %3, [r0+r2]  ; line 2
    movdqa %4, [r0+r3]  ; line 3
    movdqa %5, %3
    punpcklbw %3, xmm7
    punpckhbw %5, xmm7
    movdqa %6, %4
    punpcklbw %6, xmm7
    punpckhbw %4, xmm7
    paddw %3, %6
    paddw %4, %5
    paddw %1, %3    ; block 0, 1
    paddw %2, %4    ; block 2, 3
    phaddw %1, %2   ; block[0]: 0-15, 16-31; block[1]: 32-47, 48-63; ..
    phaddw %1, xmm7 ; block[0]: 0-15; block[1]: 16-31; block[2]: 32-47; block[3]: 48-63; ....
    psraw %1, $04
%endmacro

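Both macros reduce one 16x4 pixel strip to four 4x4-block averages; only the final reduction differs (shuffle+add vs. phaddw). A C sketch of the value being computed, under the assumption that r1/r2/r3 hold 1x/2x/3x the line size:

    #include <stdint.h>

    /* For one 16-wide, 4-tall strip, produce four values, each the
     * average of a 4x4 sub-block: (sum of 16 pixels) >> 4. */
    static void vaa_avg_blocks_c(const uint8_t* p, int32_t iLineSize,
                                 uint16_t avg[4]) {
        for (int b = 0; b < 4; ++b) {
            int sum = 0;
            for (int y = 0; y < 4; ++y)
                for (int x = 0; x < 4; ++x)
                    sum += p[y * iLineSize + 4 * b + x];
            avg[b] = (uint16_t)(sum >> 4);
        }
    }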
@ -143,7 +143,7 @@ SECTION .text
; , 6/7/2010

;***********************************************************************
;   int32_t AnalysisVaaInfoIntra_sse2( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_sse2

@ -174,71 +174,71 @@ WELS_EXTERN AnalysisVaaInfoIntra_sse2
    mov r4, r2
    sal r4, $01     ;r4 = 4*iLineSize

    pxor xmm7, xmm7

    ; loops
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+8], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+16], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+24], xmm0

    movdqa xmm0, [r7]       ; block 0~7
    movdqa xmm1, [r7+16]    ; block 8~15
    movdqa xmm2, xmm0
    paddw xmm0, xmm1
    SUM_WORD_8x2_SSE2 xmm0, xmm3

    pmullw xmm1, xmm1
    pmullw xmm2, xmm2
    movdqa xmm3, xmm1
    movdqa xmm4, xmm2
    punpcklwd xmm1, xmm7
    punpckhwd xmm3, xmm7
    punpcklwd xmm2, xmm7
    punpckhwd xmm4, xmm7
    paddd xmm1, xmm2
    paddd xmm3, xmm4
    paddd xmm1, xmm3
    pshufd xmm2, xmm1, 01Bh
    paddd xmm1, xmm2
    pshufd xmm2, xmm1, 0B1h
    paddd xmm1, xmm2


    movd r2d, xmm0
    and r2, 0ffffh      ; effective low word truncated
    mov r3, r2
    imul r2, r3
    sar r2, $04
    movd retrd, xmm1
    sub retrd, r2d

    add r7, 32
    add r7, r5

%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    POP_XMM

    ret

;***********************************************************************
;   int32_t AnalysisVaaInfoIntra_ssse3( uint8_t *pDataY, const int32_t iLineSize );
;***********************************************************************
WELS_EXTERN AnalysisVaaInfoIntra_ssse3

@ -269,47 +269,47 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
    mov r4, r2
    sal r4, $01     ;r4 = 4*iLineSize

    pxor xmm7, xmm7

    ; loops
    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movq [r7+8], xmm1

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5
    movq [r7+16], xmm0

    lea r0, [r0+r4]
    VAA_AVG_BLOCK_SSSE3 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6
    movq [r7+24], xmm1

    movdqa xmm0, [r7]
    movdqa xmm1, [r7+16]
    movdqa xmm2, xmm0
    paddw xmm0, xmm1
    SUM_WORD_8x2_SSE2 xmm0, xmm3    ; better performance than that of phaddw sets

    pmullw xmm1, xmm1
    pmullw xmm2, xmm2
    movdqa xmm3, xmm1
    movdqa xmm4, xmm2
    punpcklwd xmm1, xmm7
    punpckhwd xmm3, xmm7
    punpcklwd xmm2, xmm7
    punpckhwd xmm4, xmm7
    paddd xmm1, xmm2
    paddd xmm3, xmm4
    paddd xmm1, xmm3
    pshufd xmm2, xmm1, 01Bh
    paddd xmm1, xmm2
    pshufd xmm2, xmm1, 0B1h
    paddd xmm1, xmm2


    movd r2d, xmm0
@ -318,94 +318,94 @@ WELS_EXTERN AnalysisVaaInfoIntra_ssse3
    imul r2, r3
    sar r2, $04
    movd retrd, xmm1
    sub retrd, r2d

    add r7, 32
    add r7, r5
%ifdef X86_32
    pop r6
    pop r5
    pop r4
    pop r3
%endif
    POP_XMM

    ret

;***********************************************************************
;   uint8_t MdInterAnalysisVaaInfo_sse41( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse41
    %assign push_num 0
    LOAD_1_PARA
    movdqa xmm0, [r0]
    pshufd xmm1, xmm0, 01Bh
    paddd xmm1, xmm0
    pshufd xmm2, xmm1, 0B1h
    paddd xmm1, xmm2
    psrad xmm1, 02h     ; iAverageSad
    movdqa xmm2, xmm1
    psrad xmm2, 06h
    movdqa xmm3, xmm0   ; iSadBlock
    psrad xmm3, 06h
    psubd xmm3, xmm2
    pmulld xmm3, xmm3   ; [comment]: pmulld from SSE4.1 instruction sets
    pshufd xmm4, xmm3, 01Bh
    paddd xmm4, xmm3
    pshufd xmm3, xmm4, 0B1h
    paddd xmm3, xmm4
    movd r0d, xmm3
    cmp r0d, 20         ; INTER_VARIANCE_SAD_THRESHOLD

    jb near .threshold_exit
    pshufd xmm0, xmm0, 01Bh
    pcmpgtd xmm0, xmm1  ; iSadBlock > iAverageSad
    movmskps retrd, xmm0
    ret
.threshold_exit:
    mov retrd, 15
    ret

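In scalar terms, the routine above takes the four 8x8 SADs, forms their mean, accumulates a scaled squared-deviation measure, and returns either a per-block "above average" mask or 15 when the variance is below the threshold. A C sketch of that logic (illustrative; the mask bit ordering is simplified relative to the pshufd/movmskps sequence):

    #include <stdint.h>

    static uint8_t MdInterAnalysisVaaInfo_c(const int32_t pSad8x8[4]) {
        int32_t iAvg = (pSad8x8[0] + pSad8x8[1] +
                        pSad8x8[2] + pSad8x8[3]) >> 2;
        int32_t iVar = 0;
        for (int i = 0; i < 4; ++i) {
            int32_t d = (pSad8x8[i] >> 6) - (iAvg >> 6);
            iVar += d * d;
        }
        if (iVar < 20)  /* INTER_VARIANCE_SAD_THRESHOLD */
            return 15;
        uint8_t mask = 0;
        for (int i = 0; i < 4; ++i)
            mask |= (uint8_t)((pSad8x8[i] > iAvg) << i);
        return mask;
    }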
;***********************************************************************
;   uint8_t MdInterAnalysisVaaInfo_sse2( int32_t *pSad8x8 )
;***********************************************************************
WELS_EXTERN MdInterAnalysisVaaInfo_sse2
    %assign push_num 0
    LOAD_1_PARA
    movdqa xmm0, [r0]
    pshufd xmm1, xmm0, 01Bh
    paddd xmm1, xmm0
    pshufd xmm2, xmm1, 0B1h
    paddd xmm1, xmm2
    psrad xmm1, 02h     ; iAverageSad
    movdqa xmm2, xmm1
    psrad xmm2, 06h
    movdqa xmm3, xmm0   ; iSadBlock
    psrad xmm3, 06h
    psubd xmm3, xmm2

    ; to replace pmulld functionality as below
    movdqa xmm2, xmm3
    pmuludq xmm2, xmm3
    pshufd xmm4, xmm3, 0B1h
    pmuludq xmm4, xmm4
    movdqa xmm5, xmm2
    punpckldq xmm5, xmm4
    punpckhdq xmm2, xmm4
    punpcklqdq xmm5, xmm2

    pshufd xmm4, xmm5, 01Bh
    paddd xmm4, xmm5
    pshufd xmm5, xmm4, 0B1h
    paddd xmm5, xmm4

    movd r0d, xmm5
    cmp r0d, 20     ; INTER_VARIANCE_SAD_THRESHOLD
    jb near .threshold_exit
    pshufd xmm0, xmm0, 01Bh
    pcmpgtd xmm0, xmm1  ; iSadBlock > iAverageSad
    movmskps retrd, xmm0
    ret
.threshold_exit:
    mov retrd, 15
    ret
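The "to replace pmulld functionality" block is the standard SSE2 emulation of a 32-bit lane-wise multiply: two pmuludq instructions cover the even and odd lanes, then the low 32 bits of each product are re-interleaved. The same idiom as a C intrinsics sketch:

    #include <emmintrin.h>  /* SSE2 */

    /* Emulate pmulld (SSE4.1) on SSE2: keep the low 32 bits of each
     * 32x32 product. */
    static __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
        __m128i even = _mm_mul_epu32(a, b);                 /* lanes 0,2 */
        __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),
                                     _mm_srli_si128(b, 4)); /* lanes 1,3 */
        return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, 0x08),
                                  _mm_shuffle_epi32(odd,  0x08));
    }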
@ -36,128 +36,128 @@
#ifdef __APPLE__

.macro ROW_TRANSFORM_1_STEP
// {    // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
    vaddl.s16 $4, $0, $2        //int32 e[i][0] = src[0] + src[2];
    vsubl.s16 $5, $0, $2        //int32 e[i][1] = src[0] - src[2];
    vshr.s16 $8, $1, #1
    vshr.s16 $9, $3, #1
    vsubl.s16 $6, $8, $3        //int32 e[i][2] = (src[1]>>1)-src[3];
    vaddl.s16 $7, $1, $9        //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm

.macro TRANSFORM_4BYTES // both row & col transform used
// {    // output: f_q[0]~[3], input: e_q[0]~[3];
    vadd.s32 $0, $4, $7         //int16 f[i][0] = e[i][0] + e[i][3];
    vadd.s32 $1, $5, $6         //int16 f[i][1] = e[i][1] + e[i][2];
    vsub.s32 $2, $5, $6         //int16 f[i][2] = e[i][1] - e[i][2];
    vsub.s32 $3, $4, $7         //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm

.macro COL_TRANSFORM_1_STEP
// {    // input: src_q[0]~[3], output: e_q[0]~[3];
    vadd.s32 $4, $0, $2         //int32 e[0][j] = f[0][j] + f[2][j];
    vsub.s32 $5, $0, $2         //int32 e[1][j] = f[0][j] - f[2][j];
    vshr.s32 $6, $1, #1
    vshr.s32 $7, $3, #1
    vsub.s32 $6, $6, $3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
    vadd.s32 $7, $1, $7         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm

#else

.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
// {    // input: src_d[0]~[3], output: e_q[0]~[3]; working: $8 $9
    vaddl.s16 \arg4, \arg0, \arg2       //int32 e[i][0] = src[0] + src[2];
    vsubl.s16 \arg5, \arg0, \arg2       //int32 e[i][1] = src[0] - src[2];
    vshr.s16 \arg8, \arg1, #1
    vshr.s16 \arg9, \arg3, #1
    vsubl.s16 \arg6, \arg8, \arg3       //int32 e[i][2] = (src[1]>>1)-src[3];
    vaddl.s16 \arg7, \arg1, \arg9       //int32 e[i][3] = src[1] + (src[3]>>1);
// }
.endm

.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 // both row & col transform used
// {    // output: f_q[0]~[3], input: e_q[0]~[3];
    vadd.s32 \arg0, \arg4, \arg7        //int16 f[i][0] = e[i][0] + e[i][3];
    vadd.s32 \arg1, \arg5, \arg6        //int16 f[i][1] = e[i][1] + e[i][2];
    vsub.s32 \arg2, \arg5, \arg6        //int16 f[i][2] = e[i][1] - e[i][2];
    vsub.s32 \arg3, \arg4, \arg7        //int16 f[i][3] = e[i][0] - e[i][3];
// }
.endm

.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
// {    // input: src_q[0]~[3], output: e_q[0]~[3];
    vadd.s32 \arg4, \arg0, \arg2        //int32 e[0][j] = f[0][j] + f[2][j];
    vsub.s32 \arg5, \arg0, \arg2        //int32 e[1][j] = f[0][j] - f[2][j];
    vshr.s32 \arg6, \arg1, #1
    vshr.s32 \arg7, \arg3, #1
    vsub.s32 \arg6, \arg6, \arg3        //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
    vadd.s32 \arg7, \arg1, \arg7        //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
// }
.endm

#endif
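The macros above are the H.264 4x4 inverse-transform butterfly; the inline comments already give the formulas. The same two steps in plain C, taken directly from those comments:

    #include <stdint.h>

    /* One row step of the inverse transform. */
    static void row_transform_1_step(const int16_t src[4], int32_t e[4]) {
        e[0] = src[0] + src[2];
        e[1] = src[0] - src[2];
        e[2] = (src[1] >> 1) - src[3];
        e[3] = src[1] + (src[3] >> 1);
    }

    /* Shared by row and column passes. */
    static void transform_4bytes(const int32_t e[4], int32_t f[4]) {
        f[0] = e[0] + e[3];
        f[1] = e[1] + e[2];
        f[2] = e[1] - e[2];
        f[3] = e[0] - e[3];
    }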
// r0    int16_t* block,
// r1    int8_t* non_zero_count,
WELS_ASM_FUNC_BEGIN SetNonZeroCount_neon

    vld1.64 {d0-d2}, [r1]

    vceq.s8 q0, q0, #0
    vceq.s8 d2, d2, #0
    vmvn q0, q0
    vmvn d2, d2
    vabs.s8 q0, q0
    vabs.s8 d2, d2

    vst1.64 {d0-d2}, [r1]
WELS_ASM_FUNC_END

// uint8_t *pred, const int32_t stride, int16_t *rs
WELS_ASM_FUNC_BEGIN IdctResAddPred_neon

    vld4.s16 {d0, d1, d2, d3}, [r2]     // cost 3 cycles!

    ROW_TRANSFORM_1_STEP d0, d1, d2, d3, q8, q9, q10, q11, d4, d5

    TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11

    // transform element 32bits
    vtrn.s32 q0, q1     //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
    vtrn.s32 q2, q3     //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
    vswp d1, d4         //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
    vswp d3, d6         //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]

    COL_TRANSFORM_1_STEP q0, q1, q2, q3, q8, q9, q10, q11

    TRANSFORM_4BYTES q0, q1, q2, q3, q8, q9, q10, q11

    //after clip_table[MAX_NEG_CROP] into [0, 255]
    mov r2, r0
    vld1.32 {d20[0]}, [r0], r1
    vld1.32 {d20[1]}, [r0], r1
    vld1.32 {d22[0]}, [r0], r1
    vld1.32 {d22[1]}, [r0]

    vrshrn.s32 d16, q0, #6
    vrshrn.s32 d17, q1, #6
    vrshrn.s32 d18, q2, #6
    vrshrn.s32 d19, q3, #6

    vmovl.u8 q0, d20
    vmovl.u8 q1, d22
    vadd.s16 q0, q8
    vadd.s16 q1, q9

    vqmovun.s16 d20, q0
    vqmovun.s16 d22, q1

    vst1.32 {d20[0]}, [r2], r1
    vst1.32 {d20[1]}, [r2], r1
    vst1.32 {d22[0]}, [r2], r1
    vst1.32 {d22[1]}, [r2]
WELS_ASM_FUNC_END
#endif

@ -38,104 +38,104 @@
#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
    vld1.8 {$0[0]}, [$1], $2
    vld1.8 {$0[1]}, [$1], $2
    vld1.8 {$0[2]}, [$1], $2
    vld1.8 {$0[3]}, [$1], $2
    vld1.8 {$0[4]}, [$1], $2
    vld1.8 {$0[5]}, [$1], $2
    vld1.8 {$0[6]}, [$1], $2
    vld1.8 {$0[7]}, [$1], $2
.endmacro
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
    vld1.8 {\arg0[0]}, [\arg1], \arg2
    vld1.8 {\arg0[1]}, [\arg1], \arg2
    vld1.8 {\arg0[2]}, [\arg1], \arg2
    vld1.8 {\arg0[3]}, [\arg1], \arg2
    vld1.8 {\arg0[4]}, [\arg1], \arg2
    vld1.8 {\arg0[5]}, [\arg1], \arg2
    vld1.8 {\arg0[6]}, [\arg1], \arg2
    vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif

WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredV_neon
    //Get the top line data into 'q0'
    sub r2, r0, r1
    vldm r2, {d0, d1}

    mov r2, r0
    mov r3, #4
    //Copy the top line into each line of the MB (16x16)
loop_0_get_i16x16_luma_pred_v:
    vst1.8 {d0,d1}, [r2], r1
    vst1.8 {d0,d1}, [r2], r1
    vst1.8 {d0,d1}, [r2], r1
    vst1.8 {d0,d1}, [r2], r1
    subs r3, #1
    bne loop_0_get_i16x16_luma_pred_v

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredH_neon
    sub r2, r0, #1
    mov r3, #4
loop_0_get_i16x16_luma_pred_h:
    //Get one byte from the left side
    vld1.8 {d0[],d1[]}, [r2], r1
    vld1.8 {d2[],d3[]}, [r2], r1
    vld1.8 {d4[],d5[]}, [r2], r1
    vld1.8 {d6[],d7[]}, [r2], r1

    //Fill each line of the MB with its left-side byte
    vst1.8 {d0,d1}, [r0], r1
    vst1.8 {d2,d3}, [r0], r1
    vst1.8 {d4,d5}, [r0], r1
    vst1.8 {d6,d7}, [r0], r1

    subs r3, #1
    bne loop_0_get_i16x16_luma_pred_h

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredDc_neon
    //stmdb sp!, { r2-r5, lr}
    //Get the left vertical line data
    sub r2, r0, #1
    GET_8BYTE_DATA d0, r2, r1
    GET_8BYTE_DATA d1, r2, r1

    //Get the top horizontal line data
    sub r2, r0, r1
    vldm r2, {d2, d3}

    //Calculate the sum of the top horizontal and left vertical line data
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vadd.u16 q0, q0, q1
    vadd.u16 d0, d0, d1
    vpaddl.u16 d0, d0
    vpaddl.u32 d0, d0

    //Calculate the mean value
    vrshr.u16 d0, d0, #5
    vdup.8 q0, d0[0]

    //Set the mean value across all members of the MB
    mov r2, #4
loop_0_get_i16x16_luma_pred_dc_both:
    vst1.8 {d0,d1}, [r0], r1
    vst1.8 {d0,d1}, [r0], r1
    vst1.8 {d0,d1}, [r0], r1
    vst1.8 {d0,d1}, [r0], r1
    subs r2, #1
    bne loop_0_get_i16x16_luma_pred_dc_both

WELS_ASM_FUNC_END

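In C terms, this DC predictor averages the 16 left and 16 top neighbor pixels with rounding and fills the macroblock with the result. A sketch under the assumption that both neighbors are available, as this variant requires:

    #include <stdint.h>

    /* I16x16 DC prediction: dc = (sum of 32 neighbors + 16) >> 5,
     * matching the vrshr.u16 #5 rounding shift above. */
    static void I16x16LumaPredDc_c(uint8_t* pPred, int32_t iStride) {
        int32_t sum = 0;
        for (int i = 0; i < 16; ++i) {
            sum += pPred[i * iStride - 1];  /* left column */
            sum += pPred[-iStride + i];     /* top row     */
        }
        uint8_t dc = (uint8_t)((sum + 16) >> 5);
        for (int y = 0; y < 16; ++y)
            for (int x = 0; x < 16; ++x)
                pPred[y * iStride + x] = dc;
    }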
@ -149,386 +149,386 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd
|
||||
|
||||
|
||||
WELS_ASM_FUNC_BEGIN WelsDecoderI16x16LumaPredPlane_neon
    //stmdb sp!, { r2-r5, lr}

    //Load the table {(8,7,6,5,4,3,2,1) * 5}
    adr r2, CONST0_GET_I16X16_LUMA_PRED_PLANE
    vldr d0, [r2]

    //Pack the top[-1] ~ top[6] to d1
    sub r2, r0, r1
    sub r3, r2, #1
    vld1.8 d1, [r3]

    //Pack the top[8] ~ top[15] to d2
    add r3, #9
    vld1.8 d2, [r3]

    //Save the top[15] to d6 for next step
    vdup.u8 d6, d2[7]

    //Get and pack left[-1] ~ left[6] to d4
    sub r3, r2, #1
    GET_8BYTE_DATA d4, r3, r1

    //Get and pack left[8] ~ left[15] to d3
    add r3, r1
    GET_8BYTE_DATA d3, r3, r1

    //Save the left[15] to d7 for next step
    vdup.u8 d7, d3[7]

    //revert the sequence of d2,d3
    vrev64.8 q1, q1

    vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
    vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}

    vmovl.u8 q0, d0
    vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
    vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}

    //Calculate the sum of items of q1, q2
    vpadd.s16 d0, d2, d3
    vpadd.s16 d1, d4, d5
    vpaddl.s16 q0, q0
    vpaddl.s32 q0, q0

    //Get the value of 'b', 'c' and extend to q1, q2.
    vrshr.s64 q0, #6
    vdup.s16 q1, d0[0]
    vdup.s16 q2, d1[0]

    //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
    adr r2, CONST1_GET_I16X16_LUMA_PRED_PLANE
    vld1.32 {d0}, [r2]

    //Get the value of 'a' and save to q3
    vaddl.u8 q3, d6, d7
    vshl.u16 q3, #4

    //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
    vmovl.s8 q0, d0
    vmla.s16 q3, q0, q1
    vmla.s16 q3, q2, d0[0]

    //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
    vshl.s16 q8, q1, #3
    vadd.s16 q8, q3

    //right shift 5 bits and rounding
    vqrshrun.s16 d0, q3, #5
    vqrshrun.s16 d1, q8, #5

    //Set the line of MB
    vst1.u32 {d0,d1}, [r0], r1

    //Do the same processing for setting other lines
    mov r2, #15
loop_0_get_i16x16_luma_pred_plane:
    vadd.s16 q3, q2
    vadd.s16 q8, q2
    vqrshrun.s16 d0, q3, #5
    vqrshrun.s16 d1, q8, #5
    vst1.u32 {d0,d1}, [r0], r1
    subs r2, #1
    bne loop_0_get_i16x16_luma_pred_plane

WELS_ASM_FUNC_END

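The plane routine implements the standard H.264 16x16 plane mode: gradients b and c come from weighted differences of the border pixels (the {1..8}*5 table followed by a rounded shift by 6), a is 16*(top[15]+left[15]), and each output pixel is (a + b*(x-7) + c*(y-7) + 16) >> 5 clipped to [0,255]. A scalar sketch under those formulas; helper names are illustrative:

    #include <stdint.h>

    static uint8_t Clip255(int v) {
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    // Neighbors are read relative to 'pred' the way the NEON code does:
    // top row at pred - stride, left column at pred - 1 (the i == 8 terms
    // naturally reach the top-left corner pixel).
    static void I16x16PredPlaneRef(uint8_t* pred, int stride) {
        const uint8_t* top = pred - stride;
        int H = 0, V = 0;
        for (int i = 1; i <= 8; i++) {
            H += i * (top[7 + i] - top[7 - i]);
            V += i * (pred[(7 + i) * stride - 1] - pred[(7 - i) * stride - 1]);
        }
        int a = 16 * (top[15] + pred[15 * stride - 1]);
        int b = (5 * H + 32) >> 6;
        int c = (5 * V + 32) >> 6;
        for (int y = 0; y < 16; y++)
            for (int x = 0; x < 16; x++)
                pred[y * stride + x] = Clip255((a + b * (x - 7) + c * (y - 7) + 16) >> 5);
    }
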
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredV_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row (4 bytes)
    sub r2, r0, r1
    ldr r2, [r2]

    //Set the luma MB using top line
    str r2, [r0], r1
    str r2, [r0], r1
    str r2, [r0], r1
    str r2, [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredH_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the left column (4 bytes)
    sub r2, r0, #1
    vld1.8 {d0[]}, [r2], r1
    vld1.8 {d1[]}, [r2], r1
    vld1.8 {d2[]}, [r2], r1
    vld1.8 {d3[]}, [r2]

    //Set the luma MB using the left side byte
    vst1.32 {d0[0]}, [r0], r1
    vst1.32 {d1[0]}, [r0], r1
    vst1.32 {d2[0]}, [r0], r1
    vst1.32 {d3[0]}, [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDL_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row data(8 bytes)
    sub r2, r0, r1
    vld1.32 {d0}, [r2]

    //For "t7 + (t7<<1)"
    vdup.8 d1, d0[7]

    //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
    vext.8 d1, d0, d1, #1
    vaddl.u8 q1, d1, d0

    //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
    vext.8 q2, q1, q1, #14
    vadd.u16 q0, q1, q2

    //right shift 2 bits and rounding
    vqrshrn.u16 d0, q0, #2

    //Save "ddl0, ddl1, ddl2, ddl3"
    vext.8 d1, d0, d0, #1
    vst1.32 d1[0], [r0], r1

    //Save "ddl1, ddl2, ddl3, ddl4"
    vext.8 d1, d0, d0, #2
    vst1.32 d1[0], [r0], r1

    //Save "ddl2, ddl3, ddl4, ddl5"
    vext.8 d1, d0, d0, #3
    vst1.32 d1[0], [r0], r1

    //Save "ddl3, ddl4, ddl5, ddl6"
    vst1.32 d0[1], [r0]

WELS_ASM_FUNC_END

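The DDL routine is the 4x4 diagonal-down-left mode: each sample is a rounded 3-tap filter of the top neighbors, with t[7] repeated past the right edge, and row y of the block reads ddl[y..y+3]. A scalar sketch; the helper name is illustrative:

    #include <stdint.h>

    // ddl[i] = (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2 over the 8 top
    // neighbors, clamping the last tap to t[7].
    static void I4x4PredDDLRef(uint8_t* pred, int stride) {
        const uint8_t* t = pred - stride;
        uint8_t ddl[7];
        for (int i = 0; i < 7; i++) {
            int c = (i + 2 < 8) ? t[i + 2] : t[7];
            ddl[i] = (uint8_t)((t[i] + 2 * t[i + 1] + c + 2) >> 2);
        }
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pred[y * stride + x] = ddl[y + x];
    }
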
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredDDR_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row (4 bytes)
    sub r2, r0, r1
    vld1.32 {d0[1]}, [r2]

    //Load the left column (5 bytes)
    sub r2, #1
    vld1.8 {d0[3]}, [r2], r1
    vld1.8 {d0[2]}, [r2], r1
    vld1.8 {d0[1]}, [r2], r1
    vld1.8 {d0[0]}, [r2], r1
    vld1.8 {d1[7]}, [r2] //For packing the right sequence to do SIMD processing

    vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
                          //d2:{L3,L2,L1,L0,LT,T0,T1,T2}

    //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
    vaddl.u8 q2, d2, d0

    //q1:{TL0+LT0,LT0+T01,...L12+L23}
    vext.8 q3, q3, q2, #14
    vadd.u16 q1, q2, q3

    //right shift 2 bits and rounding
    vqrshrn.u16 d0, q1, #2

    //Adjust the data sequence for setting luma MB of 'pred'
    vst1.32 d0[1], [r0], r1
    vext.8 d0, d0, d0, #7
    vst1.32 d0[1], [r0], r1
    vext.8 d0, d0, d0, #7
    vst1.32 d0[1], [r0], r1
    vext.8 d0, d0, d0, #7
    vst1.32 d0[1], [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVL_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row (8 bytes)
    sub r2, r0, r1
    vld1.32 {d0}, [r2]

    vext.8 d1, d0, d0, #1
    vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}

    vext.8 q2, q1, q1, #2
    vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}

    //calculate the "vl0,vl1,vl2,vl3,vl4"
    vqrshrn.u16 d0, q1, #1

    //calculate the "vl5,vl6,vl7,vl8,vl9"
    vqrshrn.u16 d1, q2, #2

    //Adjust the data sequence for setting the luma MB
    vst1.32 d0[0], [r0], r1
    vst1.32 d1[0], [r0], r1
    vext.8 d0, d0, d0, #1
    vext.8 d1, d1, d1, #1
    vst1.32 d0[0], [r0], r1
    vst1.32 d1[0], [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredVR_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row (4 bytes)
    sub r2, r0, r1
    vld1.32 {d0[1]}, [r2]

    //Load the left column (4 bytes)
    sub r2, #1
    vld1.8 {d0[3]}, [r2], r1
    vld1.8 {d0[2]}, [r2], r1
    vld1.8 {d0[1]}, [r2], r1
    vld1.8 {d0[0]}, [r2]

    vext.8 d1, d0, d0, #7
    vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}

    vext.u8 q2, q1, q1, #14
    vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}

    //Calculate the vr0 ~ vr9
    vqrshrn.u16 d1, q2, #2
    vqrshrn.u16 d0, q1, #1

    //Adjust the data sequence for setting the luma MB
    vst1.32 d0[1], [r0], r1
    vst1.32 d1[1], [r0], r1
    add r2, r0, r1
    vst1.8 d1[3], [r0]!
    vst1.16 d0[2], [r0]!
    vst1.8 d0[6], [r0]!
    vst1.8 d1[2], [r2]!
    vst1.16 d1[2], [r2]!
    vst1.8 d1[6], [r2]
WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHU_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the left column data
    sub r2, r0, #1
    mov r3, #3
    mul r3, r1
    add r3, r2
    vld1.8 {d0[]}, [r3]
    vld1.8 {d0[4]}, [r2], r1
    vld1.8 {d0[5]}, [r2], r1
    vld1.8 {d0[6]}, [r2], r1 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}

    vext.8 d1, d0, d0, #1
    vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}

    vext.u8 d2, d5, d4, #2
    vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}

    //Calculate the hu0 ~ hu5
    vqrshrn.u16 d2, q2, #1
    vqrshrn.u16 d1, q1, #2

    //Adjust the data sequence for setting the luma MB
    vzip.8 d2, d1
    vst1.32 d1[0], [r0], r1
    vext.8 d2, d1, d1, #2
    vst1.32 d2[0], [r0], r1
    vst1.32 d1[1], [r0], r1
    vst1.32 d0[0], [r0]

WELS_ASM_FUNC_END

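The HU routine packs the left column as {L3,L3,L3,L3,L0,L1,L2,L3} so the 2-tap and 3-tap averages of the horizontal-up mode fall out of two shifted additions. In scalar form this mode is usually written via the index z = x + 2*y; a sketch of that formulation, with an illustrative helper name:

    #include <stdint.h>

    static void I4x4PredHURef(uint8_t* pred, int stride) {
        uint8_t L[4], out[4][4];
        for (int i = 0; i < 4; i++)
            L[i] = pred[i * stride - 1];           // left neighbors L0..L3
        for (int y = 0; y < 4; y++) {
            for (int x = 0; x < 4; x++) {
                int z = x + 2 * y, k = z >> 1;
                if (z > 5)       // everything past hu5 repeats L3
                    out[y][x] = L[3];
                else if (z & 1)  // odd index: rounded 3-tap average
                    out[y][x] = (uint8_t)((L[k] + 2 * L[k + 1]
                                + L[k + 2 < 4 ? k + 2 : 3] + 2) >> 2);
                else             // even index: rounded 2-tap average
                    out[y][x] = (uint8_t)((L[k] + L[k + 1] + 1) >> 1);
            }
        }
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                pred[y * stride + x] = out[y][x];
    }
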
WELS_ASM_FUNC_BEGIN WelsDecoderI4x4LumaPredHD_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the data
    sub r2, r0, r1
    sub r2, #1
    vld1.32 {d0[1]}, [r2], r1
    vld1.8 {d0[3]}, [r2], r1
    vld1.8 {d0[2]}, [r2], r1
    vld1.8 {d0[1]}, [r2], r1
    vld1.8 {d0[0]}, [r2] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}

    vext.8 d1, d0, d0, #7
    vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}

    vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
    vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}

    //Calculate the hd0~hd9
    vqrshrn.u16 d1, q3, #2
    vqrshrn.u16 d0, q2, #1

    //Adjust the data sequence for setting the luma MB
    vmov d3, d1
    vtrn.8 d0, d1
    vext.u8 d2, d1, d1, #6
    vst2.16 {d2[3], d3[3]}, [r0], r1
    vst2.16 {d0[2], d1[2]}, [r0], r1
    vmov d3, d0
    vst2.16 {d2[2], d3[2]}, [r0], r1
    vst2.16 {d0[1], d1[1]}, [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredV_neon
    //stmdb sp!, { r2-r5, lr}
    //Get the top row (8 byte)
    sub r2, r0, r1
    vldr d0, [r2]

    //Set the chroma MB using top row data
    vst1.8 {d0}, [r0], r1
    vst1.8 {d0}, [r0], r1
    vst1.8 {d0}, [r0], r1
    vst1.8 {d0}, [r0], r1
    vst1.8 {d0}, [r0], r1
    vst1.8 {d0}, [r0], r1
    vst1.8 {d0}, [r0], r1
    vst1.8 {d0}, [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredH_neon
    //stmdb sp!, { r2-r5, lr}
    ////Get the left column (8 byte)
    sub r2, r0, #1
    vld1.8 {d0[]}, [r2], r1
    vld1.8 {d1[]}, [r2], r1
    vld1.8 {d2[]}, [r2], r1
    vld1.8 {d3[]}, [r2], r1
    vld1.8 {d4[]}, [r2], r1
    vld1.8 {d5[]}, [r2], r1
    vld1.8 {d6[]}, [r2], r1
    vld1.8 {d7[]}, [r2]

    //Set the chroma MB using left column data
    vst1.8 {d0}, [r0], r1
    vst1.8 {d1}, [r0], r1
    vst1.8 {d2}, [r0], r1
    vst1.8 {d3}, [r0], r1
    vst1.8 {d4}, [r0], r1
    vst1.8 {d5}, [r0], r1
    vst1.8 {d6}, [r0], r1
    vst1.8 {d7}, [r0]

WELS_ASM_FUNC_END

@ -576,73 +576,73 @@ CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x2823
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003

WELS_ASM_FUNC_BEGIN WelsDecoderIChromaPredPlane_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row data
    sub r2, r0, #1
    sub r2, r1
    vld1.32 {d1[0]}, [r2]
    add r2, #5
    vld1.32 {d0[0]}, [r2]

    //Load the left column data
    sub r2, #5
    vld1.8 {d1[4]}, [r2], r1
    vld1.8 {d1[5]}, [r2], r1
    vld1.8 {d1[6]}, [r2], r1
    vld1.8 {d1[7]}, [r2], r1 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
    add r2, r1
    vld1.8 {d0[4]}, [r2], r1
    vld1.8 {d0[5]}, [r2], r1
    vld1.8 {d0[6]}, [r2], r1
    vld1.8 {d0[7]}, [r2] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}

    //Save T7 to d3 for next step
    vdup.u8 d3, d0[3]
    //Save L7 to d4 for next step
    vdup.u8 d4, d0[7]

    //Calculate the value of 'a' and save to q2
    vaddl.u8 q2, d3, d4
    vshl.u16 q2, #4

    //Load the table {{1,2,3,4,1,2,3,4}*17}
    adr r2, CONST0_GET_I_CHROMA_PRED_PLANE
    vld1.32 {d2}, [r2]

    //Calculate the 'b','c', and save to q0
    vrev32.8 d1, d1
    vsubl.u8 q0, d0, d1
    vmovl.u8 q1, d2
    vmul.s16 q0, q1
    vpaddl.s16 q0, q0
    vpaddl.s32 q0, q0
    vrshr.s64 q0, #5

    //Load the table {-3,-2,-1,0,1,2,3,4} to q3
    adr r2, CONST1_GET_I_CHROMA_PRED_PLANE
    vld1.32 {d6, d7}, [r2]

    //Duplicate the 'b','c' to q0, q1 for SIMD instruction
    vdup.s16 q1, d1[0]
    vdup.s16 q0, d0[0]

    //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
    vmla.s16 q2, q0, q3
    vmla.s16 q2, q1, d6[0]
    vqrshrun.s16 d0, q2, #5

    //Set a line of chroma MB
    vst1.u32 {d0}, [r0], r1

    //Do the same processing for each line.
    mov r2, #7
loop_0_get_i_chroma_pred_plane:
    vadd.s16 q2, q1
    vqrshrun.s16 d0, q2, #5
    vst1.u32 {d0}, [r0], r1
    subs r2, #1
    bne loop_0_get_i_chroma_pred_plane

WELS_ASM_FUNC_END

@ -54,7 +54,7 @@

%endmacro

%macro MMX_SumSub 3
    movq %3, %2
    psubw %2, %1
    paddw %1, %3
%endmacro

@ -62,8 +62,8 @@

%macro MMX_IDCT 6
    MMX_SumSub %4, %5, %6
    MMX_SumSubDiv2 %3, %2, %1
    MMX_SumSub %1, %4, %6
    MMX_SumSub %3, %5, %6
%endmacro

@ -96,13 +96,13 @@ WELS_EXTERN IdctResAddPred_mmx

    movq mm2, [r2+16]
    movq mm3, [r2+24]

    MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
    MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
    MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
    MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6

    WELS_Zero mm7
    WELS_DW32 mm6

    MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0]
    MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1]

@ -111,5 +111,5 @@ WELS_EXTERN IdctResAddPred_mmx

    MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1]

    emms
    ret

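MMX_SumSub is the butterfly used throughout this IDCT: it leaves a+b in its first operand and b-a in its second, using the third as scratch. A scalar view of one application on the four 16-bit lanes of an MMX register; the function name is illustrative:

    #include <stdint.h>

    static void MmxSumSubRef(int16_t a[4], int16_t b[4]) {
        for (int i = 0; i < 4; i++) {
            int16_t t = b[i];               // movq %3, %2
            b[i] = (int16_t)(b[i] - a[i]);  // psubw %2, %1
            a[i] = (int16_t)(a[i] + t);     // paddw %1, %3
        }
    }
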
File diff suppressed because it is too large
@ -38,107 +38,107 @@

#ifdef __APPLE__
//Global macro
.macro GET_8BYTE_DATA
    vld1.8 {$0[0]}, [$1], $2
    vld1.8 {$0[1]}, [$1], $2
    vld1.8 {$0[2]}, [$1], $2
    vld1.8 {$0[3]}, [$1], $2
    vld1.8 {$0[4]}, [$1], $2
    vld1.8 {$0[5]}, [$1], $2
    vld1.8 {$0[6]}, [$1], $2
    vld1.8 {$0[7]}, [$1], $2
.endm
#else
//Global macro
.macro GET_8BYTE_DATA arg0, arg1, arg2
    vld1.8 {\arg0[0]}, [\arg1], \arg2
    vld1.8 {\arg0[1]}, [\arg1], \arg2
    vld1.8 {\arg0[2]}, [\arg1], \arg2
    vld1.8 {\arg0[3]}, [\arg1], \arg2
    vld1.8 {\arg0[4]}, [\arg1], \arg2
    vld1.8 {\arg0[5]}, [\arg1], \arg2
    vld1.8 {\arg0[6]}, [\arg1], \arg2
    vld1.8 {\arg0[7]}, [\arg1], \arg2
.endm
#endif

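GET_8BYTE_DATA gathers one byte from each of eight consecutive rows of a strided column into a single d-register, post-incrementing the address after every load. A scalar equivalent; the function name is illustrative:

    #include <stdint.h>

    // One byte per row of a strided column, packed into eight
    // consecutive destination bytes (the macro also leaves the source
    // pointer advanced past the last row).
    static void Get8ByteDataRef(uint8_t dst[8], const uint8_t* src, int stride) {
        for (int i = 0; i < 8; i++)
            dst[i] = src[i * stride];
    }
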
WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredV_neon
    //Get the top line data to 'q0'
    sub r3, r1, r2
    vldm r3, {d0, d1}

    //mov r2, #16
    mov r3, #4
    //Set the top line to the each line of MB(16*16)
loop_0_get_i16x16_luma_pred_v:
    vst1.8 {d0,d1}, [r0]!
    vst1.8 {d0,d1}, [r0]!
    vst1.8 {d0,d1}, [r0]!
    vst1.8 {d0,d1}, [r0]!
    subs r3, #1
    bne loop_0_get_i16x16_luma_pred_v
WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredH_neon
    //stmdb sp!, {r4, lr}
    sub r1, r1, #1
    mov r3, #4
loop_0_get_i16x16_luma_pred_h:
    //Get one byte data from left side
    vld1.8 {d0[],d1[]}, [r1], r2
    vld1.8 {d2[],d3[]}, [r1], r2
    vld1.8 {d4[],d5[]}, [r1], r2
    vld1.8 {d6[],d7[]}, [r1], r2

    //Set the line of MB using the left side byte data
    vst1.8 {d0,d1}, [r0]!
    //add r0, #16
    vst1.8 {d2,d3}, [r0]!
    //add r0, #16
    vst1.8 {d4,d5}, [r0]!
    //add r0, #16
    vst1.8 {d6,d7}, [r0]!
    //add r0, #16

    subs r3, #1
    bne loop_0_get_i16x16_luma_pred_h

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredDc_neon
    //stmdb sp!, { r2-r5, lr}
    //Get the left vertical line data
    sub r3, r1, #1
    GET_8BYTE_DATA d0, r3, r2
    GET_8BYTE_DATA d1, r3, r2

    //Get the top horizontal line data
    sub r3, r1, r2
    vldm r3, {d2, d3}

    //Calculate the sum of top horizontal line data and vertical line data
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vadd.u16 q0, q0, q1
    vadd.u16 d0, d0, d1
    vpaddl.u16 d0, d0
    vpaddl.u32 d0, d0

    //Calculate the mean value
    vrshr.u16 d0, d0, #5
    vdup.8 q0, d0[0]

    //Set the mean value to the all of member of MB
    mov r3, #4
loop_0_get_i16x16_luma_pred_dc_both:
    vst1.8 {d0,d1}, [r0]!
    vst1.8 {d0,d1}, [r0]!
    vst1.8 {d0,d1}, [r0]!
    vst1.8 {d0,d1}, [r0]!
    subs r3, #1
    bne loop_0_get_i16x16_luma_pred_dc_both

WELS_ASM_FUNC_END

@ -151,383 +151,383 @@ CONST1_GET_I16X16_LUMA_PRED_PLANE: .long 0xfcfbfaf9, 0x00fffefd

WELS_ASM_FUNC_BEGIN WelsI16x16LumaPredPlane_neon
    //stmdb sp!, { r4, lr}

    //Load the table {(8,7,6,5,4,3,2,1) * 5}
    adr r3, CONST0_GET_I16X16_LUMA_PRED_PLANE
    vldr d0, [r3]

    //Pack the top[-1] ~ top[6] to d1
    sub r3, r1, r2
    sub r1, r3, #1
    vld1.8 d1, [r1]

    //Pack the top[8] ~ top[15] to d2
    add r1, #9
    vld1.8 d2, [r1]

    //Save the top[15] to d6 for next step
    vdup.u8 d6, d2[7]

    //Get and pack left[-1] ~ left[6] to d4
    sub r1, r3, #1
    GET_8BYTE_DATA d4, r1, r2

    //Get and pack left[8] ~ left[15] to d3
    add r1, r2
    GET_8BYTE_DATA d3, r1, r2

    //Save the left[15] to d7 for next step
    vdup.u8 d7, d3[7]

    //revert the sequence of d2,d3
    vrev64.8 q1, q1

    vsubl.u8 q2, d3, d4 //q2={left[8]-left[6],left[9]-left[5],left[10]-left[4], ...}
    vsubl.u8 q1, d2, d1 //q1={top[8]-top[6],top[9]-top[5],top[10]-top[4], ...}

    vmovl.u8 q0, d0
    vmul.s16 q1, q0, q1 //q1 = q1*{(8,7,6,5,4,3,2,1) * 5}
    vmul.s16 q2, q0, q2 //q2 = q2*{(8,7,6,5,4,3,2,1) * 5}

    //Calculate the sum of items of q1, q2
    vpadd.s16 d0, d2, d3
    vpadd.s16 d1, d4, d5
    vpaddl.s16 q0, q0
    vpaddl.s32 q0, q0

    //Get the value of 'b', 'c' and extend to q1, q2.
    vrshr.s64 q0, #6
    vdup.s16 q1, d0[0]
    vdup.s16 q2, d1[0]

    //Load the table {-7,-6,-5,-4,-3,-2,-1,0} to d0
    adr r3, CONST1_GET_I16X16_LUMA_PRED_PLANE
    vld1.32 {d0}, [r3]

    //Get the value of 'a' and save to q3
    vaddl.u8 q3, d6, d7
    vshl.u16 q3, #4

    //calculate a+'b'*{-7,-6,-5,-4,-3,-2,-1,0} + c*{-7}
    vmovl.s8 q0, d0
    vmla.s16 q3, q0, q1
    vmla.s16 q3, q2, d0[0]

    //Calculate a+'b'*{1,2,3,4,5,6,7,8} + c*{-7}
    vshl.s16 q8, q1, #3
    vadd.s16 q8, q3

    //right shift 5 bits and rounding
    vqrshrun.s16 d0, q3, #5
    vqrshrun.s16 d1, q8, #5

    //Set the line of MB
    vst1.u32 {d0,d1}, [r0]!

    //Do the same processing for setting other lines
    mov r3, #15
loop_0_get_i16x16_luma_pred_plane:
    vadd.s16 q3, q2
    vadd.s16 q8, q2
    vqrshrun.s16 d0, q3, #5
    vqrshrun.s16 d1, q8, #5
    vst1.u32 {d0,d1}, [r0]!
    subs r3, #1
    bne loop_0_get_i16x16_luma_pred_plane

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredV_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row (4 bytes)
    sub r3, r1, r2
    ldr r3, [r3]

    //Set the luma MB using top line
    str r3, [r0], #4
    str r3, [r0], #4
    str r3, [r0], #4
    str r3, [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredH_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the left column (4 bytes)
    sub r3, r1, #1
    vld1.8 {d0[]}, [r3], r2
    vld1.8 {d1[]}, [r3], r2
    vld1.8 {d2[]}, [r3], r2
    vld1.8 {d3[]}, [r3]

    //Set the luma MB using the left side byte
    vst1.32 {d0[0]}, [r0]!
    vst1.32 {d1[0]}, [r0]!
    vst1.32 {d2[0]}, [r0]!
    vst1.32 {d3[0]}, [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDL_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row data(8 bytes)
    sub r3, r1, r2
    vld1.32 {d0}, [r3]

    //For "t7 + (t7<<1)"
    vdup.8 d1, d0[7]

    //calculate "t0+t1,t1+t2,t2+t3...t6+t7,t7+t7"
    vext.8 d1, d0, d1, #1
    vaddl.u8 q1, d1, d0

    //calculate "x,t0+t1+t1+t2,t1+t2+t2+t3,...t5+t6+t6+t7,t6+t7+t7+t7"
    vext.8 q2, q1, q1, #14
    vadd.u16 q0, q1, q2

    //right shift 2 bits and rounding
    vqrshrn.u16 d0, q0, #2

    //Save "ddl0, ddl1, ddl2, ddl3"
    vext.8 d1, d0, d0, #1
    vst1.32 d1[0], [r0]!

    //Save "ddl1, ddl2, ddl3, ddl4"
    vext.8 d1, d0, d0, #2
    vst1.32 d1[0], [r0]!

    //Save "ddl2, ddl3, ddl4, ddl5"
    vext.8 d1, d0, d0, #3
    vst1.32 d1[0], [r0]!

    //Save "ddl3, ddl4, ddl5, ddl6"
    vst1.32 d0[1], [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredDDR_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row (4 bytes)
    sub r3, r1, r2
    vld1.32 {d0[1]}, [r3]

    //Load the left column (5 bytes)
    sub r3, #1
    vld1.8 {d0[3]}, [r3], r2
    vld1.8 {d0[2]}, [r3], r2
    vld1.8 {d0[1]}, [r3], r2
    vld1.8 {d0[0]}, [r3], r2
    vld1.8 {d1[7]}, [r3] //For packing the right sequence to do SIMD processing

    vext.8 d2, d1, d0, #7 //d0:{L2,L1,L0,LT,T0,T1,T2,T3}
                          //d2:{L3,L2,L1,L0,LT,T0,T1,T2}

    //q2:{L2+L3,L1+L2,L0+L1...T1+T2,T2+T3}
    vaddl.u8 q2, d2, d0

    //q1:{TL0+LT0,LT0+T01,...L12+L23}
    vext.8 q3, q3, q2, #14
    vadd.u16 q1, q2, q3

    //right shift 2 bits and rounding
    vqrshrn.u16 d0, q1, #2

    //Adjust the data sequence for setting luma MB of 'pred'
    vst1.32 d0[1], [r0]!
    vext.8 d0, d0, d0, #7
    vst1.32 d0[1], [r0]!
    vext.8 d0, d0, d0, #7
    vst1.32 d0[1], [r0]!
    vext.8 d0, d0, d0, #7
    vst1.32 d0[1], [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVL_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row (8 bytes)
    sub r3, r1, r2
    vld1.32 {d0}, [r3]

    vext.8 d1, d0, d0, #1
    vaddl.u8 q1, d1, d0 //q1:{t0+t1,t1+t2,t2+t3...t5+t6,x,x}

    vext.8 q2, q1, q1, #2
    vadd.u16 q2, q1, q2 //q2:{t0+t1+t1+t2,t1+t2+t2+t3,...t4+t5+t5+t6,x,x}

    //calculate the "vl0,vl1,vl2,vl3,vl4"
    vqrshrn.u16 d0, q1, #1

    //calculate the "vl5,vl6,vl7,vl8,vl9"
    vqrshrn.u16 d1, q2, #2

    //Adjust the data sequence for setting the luma MB
    vst1.32 d0[0], [r0]!
    vst1.32 d1[0], [r0]!
    vext.8 d0, d0, d0, #1
    vext.8 d1, d1, d1, #1
    vst1.32 d0[0], [r0]!
    vst1.32 d1[0], [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredVR_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row (4 bytes)
    sub r3, r1, r2
    vld1.32 {d0[1]}, [r3]

    //Load the left column (4 bytes)
    sub r3, #1
    vld1.8 {d0[3]}, [r3], r2
    vld1.8 {d0[2]}, [r3], r2
    vld1.8 {d0[1]}, [r3], r2
    vld1.8 {d0[0]}, [r3]

    vext.8 d1, d0, d0, #7
    vaddl.u8 q1, d0, d1 //q1:{X,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2,T2+T3}

    vext.u8 q2, q1, q1, #14
    vadd.u16 q2, q2, q1 //q2:{X,L2+L1+L1+L0,L1+L0+L0+LT,...T1+T2+T2+T3}

    //Calculate the vr0 ~ vr9
    vqrshrn.u16 d1, q2, #2
    vqrshrn.u16 d0, q1, #1

    //Adjust the data sequence for setting the luma MB
    vst1.32 d0[1], [r0]!
    vst1.32 d1[1], [r0]!
    //add r2, r0, r1
    vst1.8 d1[3], [r0]!
    vst1.16 d0[2], [r0]!
    vst1.8 d0[6], [r0]!
    vst1.8 d1[2], [r0]!
    vst1.16 d1[2], [r0]!
    vst1.8 d1[6], [r0]
WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHU_neon
    //stmdb sp!, { r4, lr}
    //Load the left column data
    sub r3, r1, #1
    mov r1, #3
    mul r1, r2
    add r1, r3
    vld1.8 {d0[]}, [r1]
    vld1.8 {d0[4]}, [r3], r2
    vld1.8 {d0[5]}, [r3], r2
    vld1.8 {d0[6]}, [r3], r2 //d0:{L3,L3,L3,L3,L0,L1,L2,L3}

    vext.8 d1, d0, d0, #1
    vaddl.u8 q2, d0, d1 //q2:{L3+L3,L3+L3,L3+L3,L3+L0,L0+L1,L1+L2,L2+L3,L3+L3}

    vext.u8 d2, d5, d4, #2
    vadd.u16 d3, d2, d5 //d3:{L0+L1+L1+L2,L1+L2+L2+L3,L2+L3+L3+L3,L3+L3+L3+L3}

    //Calculate the hu0 ~ hu5
    vqrshrn.u16 d2, q2, #1
    vqrshrn.u16 d1, q1, #2

    //Adjust the data sequence for setting the luma MB
    vzip.8 d2, d1
    vst1.32 d1[0], [r0]!
    vext.8 d2, d1, d1, #2
    vst1.32 d2[0], [r0]!
    vst1.32 d1[1], [r0]!
    vst1.32 d0[0], [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsI4x4LumaPredHD_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the data
    sub r3, r1, r2
    sub r3, #1
    vld1.32 {d0[1]}, [r3], r2
    vld1.8 {d0[3]}, [r3], r2
    vld1.8 {d0[2]}, [r3], r2
    vld1.8 {d0[1]}, [r3], r2
    vld1.8 {d0[0]}, [r3] //d0:{L3,L2,L1,L0,LT,T0,T1,T2}

    vext.8 d1, d0, d0, #7
    vaddl.u8 q1, d0, d1 //q1:{x,L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1,T1+T2}

    vext.u8 q2, q1, q1, #14 //q2:{x,x, L3+L2,L2+L1,L1+L0,L0+LT,LT+T0,T0+T1}
    vadd.u16 q3, q2, q1 //q3:{x,x,L3+L2+L2+L1,L2+L1+L1+L0,L1+L0+L0+LT,L0+LT+LT+T0,LT+T0+T0+T1,T0+T1+T1+T2}

    //Calculate the hd0~hd9
    vqrshrn.u16 d1, q3, #2
    vqrshrn.u16 d0, q2, #1

    //Adjust the data sequence for setting the luma MB
    vmov d3, d1
    vtrn.8 d0, d1
    vext.u8 d2, d1, d1, #6
    vst2.16 {d2[3], d3[3]}, [r0]!
    vst2.16 {d0[2], d1[2]}, [r0]!
    vmov d3, d0
    vst2.16 {d2[2], d3[2]}, [r0]!
    vst2.16 {d0[1], d1[1]}, [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsIChromaPredV_neon
    //stmdb sp!, { r2-r5, lr}
    //Get the top row (8 byte)
    sub r3, r1, r2
    vldr d0, [r3]

    //Set the chroma MB using top row data
    vst1.8 {d0}, [r0]!
    vst1.8 {d0}, [r0]!
    vst1.8 {d0}, [r0]!
    vst1.8 {d0}, [r0]!
    vst1.8 {d0}, [r0]!
    vst1.8 {d0}, [r0]!
    vst1.8 {d0}, [r0]!
    vst1.8 {d0}, [r0]

WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsIChromaPredH_neon
    //stmdb sp!, { r2-r5, lr}
    ////Get the left column (8 byte)
    sub r3, r1, #1
    vld1.8 {d0[]}, [r3], r2
    vld1.8 {d1[]}, [r3], r2
    vld1.8 {d2[]}, [r3], r2
    vld1.8 {d3[]}, [r3], r2
    vld1.8 {d4[]}, [r3], r2
    vld1.8 {d5[]}, [r3], r2
    vld1.8 {d6[]}, [r3], r2
    vld1.8 {d7[]}, [r3]

    //Set the chroma MB using left column data
    vst1.8 {d0}, [r0]!
    vst1.8 {d1}, [r0]!
    vst1.8 {d2}, [r0]!
    vst1.8 {d3}, [r0]!
    vst1.8 {d4}, [r0]!
    vst1.8 {d5}, [r0]!
    vst1.8 {d6}, [r0]!
    vst1.8 {d7}, [r0]

WELS_ASM_FUNC_END

@ -575,73 +575,73 @@ CONST0_GET_I_CHROMA_PRED_PLANE: .long 0x44332211, 0x44332211//0x140f0a05, 0x2823
CONST1_GET_I_CHROMA_PRED_PLANE: .long 0xfffefffd, 0x0000ffff,0x00020001,0x00040003

WELS_ASM_FUNC_BEGIN WelsIChromaPredPlane_neon
    //stmdb sp!, { r2-r5, lr}
    //Load the top row data
    sub r3, r1, #1
    sub r3, r2
    vld1.32 {d1[0]}, [r3]
    add r3, #5
    vld1.32 {d0[0]}, [r3]

    //Load the left column data
    sub r3, #5
    vld1.8 {d1[4]}, [r3], r2
    vld1.8 {d1[5]}, [r3], r2
    vld1.8 {d1[6]}, [r3], r2
    vld1.8 {d1[7]}, [r3], r2 //d1:{LT,T0,T1,T2,LT,L0,L1,L2}
    add r3, r2
    vld1.8 {d0[4]}, [r3], r2
    vld1.8 {d0[5]}, [r3], r2
    vld1.8 {d0[6]}, [r3], r2
    vld1.8 {d0[7]}, [r3] //d0:{T4,T5,T6,T7,L4,L5,L6.L7}

    //Save T7 to d3 for next step
    vdup.u8 d3, d0[3]
    //Save L7 to d4 for next step
    vdup.u8 d4, d0[7]

    //Calculate the value of 'a' and save to q2
    vaddl.u8 q2, d3, d4
    vshl.u16 q2, #4

    //Load the table {{1,2,3,4,1,2,3,4}*17}
    adr r3, CONST0_GET_I_CHROMA_PRED_PLANE
    vld1.32 {d2}, [r3]

    //Calculate the 'b','c', and save to q0
    vrev32.8 d1, d1
    vsubl.u8 q0, d0, d1
    vmovl.u8 q1, d2
    vmul.s16 q0, q1
    vpaddl.s16 q0, q0
    vpaddl.s32 q0, q0
    vrshr.s64 q0, #5

    //Load the table {-3,-2,-1,0,1,2,3,4} to q3
    adr r3, CONST1_GET_I_CHROMA_PRED_PLANE
    vld1.32 {d6, d7}, [r3]

    //Duplicate the 'b','c' to q0, q1 for SIMD instruction
    vdup.s16 q1, d1[0]
    vdup.s16 q0, d0[0]

    //Calculate the "(a + b * (j - 3) + c * (- 3) + 16) >> 5;"
    vmla.s16 q2, q0, q3
    vmla.s16 q2, q1, d6[0]
    vqrshrun.s16 d0, q2, #5

    //Set a line of chroma MB
    vst1.u32 {d0}, [r0]!

    //Do the same processing for each line.
    mov r3, #7
loop_0_get_i_chroma_pred_plane:
    vadd.s16 q2, q1
    vqrshrun.s16 d0, q2, #5
    vst1.u32 {d0}, [r0]!
    subs r3, #1
    bne loop_0_get_i_chroma_pred_plane

WELS_ASM_FUNC_END

File diff suppressed because it is too large
@ -66,10 +66,10 @@

    vsub.s16 q3, q12, q13

    vadd.s16 q8, q10, q11
    vsub.s16 q9, q10, q11

    vadd.s16 q10, q14, q15
    vsub.s16 q11, q14, q15

    vadd.s16 q12, q0, q2
    vsub.s16 q14, q0, q2

@ -372,28 +372,28 @@ WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN WelsSampleSad4x4_neon
    stmdb sp!, {r4-r5, lr}

    //Loading a horizontal line data (4 bytes)
    //line 0
    ldr r4, [r0], r1
    ldr r5, [r2], r3
    usad8 lr, r4, r5

    //line 1
    ldr r4, [r0], r1
    ldr r5, [r2], r3
    usada8 lr, r4, r5, lr

    //line 2
    ldr r4, [r0], r1
    ldr r5, [r2], r3
    usada8 lr, r4, r5, lr

    //line 3
    ldr r4, [r0]
    ldr r5, [r2]
    usada8 r0, r4, r5, lr

    ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END

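WelsSampleSad4x4 accumulates the block SAD four bytes at a time with the ARMv6 usad8/usada8 instructions, and the final usada8 leaves the result directly in r0. The scalar definition it matches; the function name is illustrative:

    #include <stdint.h>
    #include <stdlib.h>

    // Sum of absolute differences over a 4x4 block.
    static int Sad4x4Ref(const uint8_t* pix1, int stride1,
                         const uint8_t* pix2, int stride2) {
        int sad = 0;
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                sad += abs(pix1[y * stride1 + x] - pix2[y * stride2 + x]);
        return sad;
    }
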
@ -401,340 +401,340 @@ WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x16_neon

    stmdb sp!, {r4-r5, lr}

    //Generate the pix2 start addr
    sub r4, r2, #1
    add r5, r2, #1
    sub r2, r3

    //Loading a horizontal line data (16 bytes)
    vld1.8 {q0}, [r0], r1 //save pix1

    vld1.8 {q1}, [r2], r3 //save pix2 - stride
    vld1.8 {q10}, [r2], r3 //save pix2
    vld1.8 {q2}, [r2], r3 //save pix2 + stride

    vld1.8 {q3}, [r4], r3 //save pix2 - 1
    vld1.8 {q8}, [r5], r3 //save pix2 + 1

    //Do the SAD for 16 bytes
    vabdl.u8 q15, d0, d2
    vabal.u8 q15, d1, d3

    vabdl.u8 q13, d0, d4
    vabal.u8 q13, d1, d5

    vabdl.u8 q11, d0, d6
    vabal.u8 q11, d1, d7

    vabdl.u8 q9, d0, d16
    vabal.u8 q9, d1, d17

    mov lr, #15
pixel_sad_4_16x16_loop_0:

    //Loading a horizontal line data (16 bytes)
    vld1.8 {q0}, [r0], r1 //save pix1
    vmov.8 q1, q10 //save pix2 - stride
    vmov.8 q10, q2
    vabal.u8 q15, d0, d2
    vld1.8 {q2}, [r2], r3 //save pix2 + stride
    vabal.u8 q15, d1, d3
    vld1.8 {q3}, [r4], r3 //save pix2 - 1
    vabal.u8 q13, d0, d4
    vld1.8 {q8}, [r5], r3 //save pix2 + 1
    vabal.u8 q13, d1, d5
    subs lr, #1

    vabal.u8 q11, d0, d6
    vabal.u8 q11, d1, d7

    vabal.u8 q9, d0, d16
    vabal.u8 q9, d1, d17

    bne pixel_sad_4_16x16_loop_0

    //Save SAD to 'r0'
    ldr r0, [sp, #12]

    vadd.u16 d0, d30, d31
    vadd.u16 d1, d26, d27
    vadd.u16 d2, d22, d23
    vadd.u16 d3, d18, d19

    vpaddl.u16 q0, q0
    vpaddl.u16 q1, q1

    vpaddl.u32 q0, q0
    vpaddl.u32 q1, q1

    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]

    ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
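The SadFour kernels compute four SADs at once, comparing pix1 against pix2 shifted one pixel up, down, left and right; vabdl starts each accumulator on the first row and vabal folds in the remaining rows. A hedged C sketch of the same contract (names and the exact output ordering are taken from the register comments above; the C function itself is illustrative):

#include <stdint.h>
#include <stdlib.h>

/* Reference for WelsSampleSadFour16x16_neon: SAD of pix1 against
 * pix2 - stride, pix2 + stride, pix2 - 1 and pix2 + 1. */
static void SadFour16x16_c(const uint8_t* pix1, int32_t stride1,
                           const uint8_t* pix2, int32_t stride2,
                           int32_t sad[4]) {
    const uint8_t* ref[4] = { pix2 - stride2, pix2 + stride2,
                              pix2 - 1, pix2 + 1 };
    for (int i = 0; i < 4; i++) {
        const uint8_t* p1 = pix1;
        const uint8_t* p2 = ref[i];
        int32_t s = 0;
        for (int y = 0; y < 16; y++) {
            for (int x = 0; x < 16; x++)
                s += abs(p1[x] - p2[x]);
            p1 += stride1;
            p2 += stride2;
        }
        sad[i] = s;
    }
}

The 16x8, 8x16, 8x8 and 4x4 variants below follow the same scheme with the block dimensions changed.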
WELS_ASM_FUNC_BEGIN WelsSampleSadFour16x8_neon
    stmdb sp!, {r4-r5, lr}

    //Generate the pix2 start addr
    sub r4, r2, #1
    add r5, r2, #1
    sub r2, r3

    //Loading a horizontal line data (16 bytes)
    vld1.8 {q0}, [r0], r1 //save pix1

    vld1.8 {q1}, [r2], r3 //save pix2 - stride
    vld1.8 {q10}, [r2], r3 //save pix2
    vld1.8 {q2}, [r2], r3 //save pix2 + stride

    vld1.8 {q3}, [r4], r3 //save pix2 - 1
    vld1.8 {q8}, [r5], r3 //save pix2 + 1

    //Do the SAD for 16 bytes
    vabdl.u8 q15, d0, d2
    vabal.u8 q15, d1, d3

    vabdl.u8 q13, d0, d4
    vabal.u8 q13, d1, d5

    vabdl.u8 q11, d0, d6
    vabal.u8 q11, d1, d7

    vabdl.u8 q9, d0, d16
    vabal.u8 q9, d1, d17

    mov lr, #7
pixel_sad_4_16x8_loop_0:

    //Loading a horizontal line data (16 bytes)
    vld1.8 {q0}, [r0], r1 //save pix1
    vmov.8 q1, q10 //save pix2 - stride
    vmov.8 q10, q2
    vabal.u8 q15, d0, d2
    vld1.8 {q2}, [r2], r3 //save pix2 + stride
    vabal.u8 q15, d1, d3
    vld1.8 {q3}, [r4], r3 //save pix2 - 1
    vabal.u8 q13, d0, d4
    vld1.8 {q8}, [r5], r3 //save pix2 + 1
    vabal.u8 q13, d1, d5
    subs lr, #1

    vabal.u8 q11, d0, d6
    vabal.u8 q11, d1, d7

    vabal.u8 q9, d0, d16
    vabal.u8 q9, d1, d17

    bne pixel_sad_4_16x8_loop_0

    //Save SAD to 'r0'
    ldr r0, [sp, #12]

    vadd.u16 d0, d30, d31
    vadd.u16 d1, d26, d27
    vadd.u16 d2, d22, d23
    vadd.u16 d3, d18, d19

    vpaddl.u16 q0, q0
    vpaddl.u16 q1, q1

    vpaddl.u32 q0, q0
    vpaddl.u32 q1, q1

    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]

    ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x16_neon
    stmdb sp!, {r4-r5, lr}

    //Generate the pix2 start addr
    sub r4, r2, #1
    add r5, r2, #1
    sub r2, r3

    //Loading a horizontal line data (8 bytes)
    vld1.8 {d0}, [r0], r1 //save pix1

    vld1.8 {d1}, [r2], r3 //save pix2 - stride
    vld1.8 {d6}, [r2], r3 //save pix2
    vld1.8 {d2}, [r2], r3 //save pix2 + stride

    vld1.8 {d3}, [r4], r3 //save pix2 - 1
    vld1.8 {d4}, [r5], r3 //save pix2 + 1

    //Do the SAD for 8 bytes
    vabdl.u8 q15, d0, d1
    vabdl.u8 q14, d0, d2
    vabdl.u8 q13, d0, d3
    vabdl.u8 q12, d0, d4

    mov lr, #15
pixel_sad_4_8x16_loop_0:

    //Loading a horizontal line data (8 bytes)
    vld1.8 {d0}, [r0], r1 //save pix1
    vmov.8 d1, d6 //save pix2 - stride
    vmov.8 d6, d2
    vld1.8 {d2}, [r2], r3 //save pix2 + stride
    vld1.8 {d3}, [r4], r3 //save pix2 - 1
    vabal.u8 q15, d0, d1

    vld1.8 {d4}, [r5], r3 //save pix2 + 1
    //Do the SAD for 8 bytes
    vabal.u8 q14, d0, d2
    vabal.u8 q13, d0, d3
    vabal.u8 q12, d0, d4
    subs lr, #1

    bne pixel_sad_4_8x16_loop_0

    //Save SAD to 'r0'
    ldr r0, [sp, #12]

    vadd.u16 d0, d30, d31
    vadd.u16 d1, d28, d29
    vadd.u16 d2, d26, d27
    vadd.u16 d3, d24, d25

    vpaddl.u16 q0, q0
    vpaddl.u16 q1, q1

    vpaddl.u32 q0, q0
    vpaddl.u32 q1, q1

    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]

    ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour8x8_neon
    stmdb sp!, {r4-r5, lr}

    //Generate the pix2 start addr
    sub r4, r2, #1
    add r5, r2, #1
    sub r2, r3

    //Loading a horizontal line data (8 bytes)
    vld1.8 {d0}, [r0], r1 //save pix1

    vld1.8 {d1}, [r2], r3 //save pix2 - stride
    vld1.8 {d6}, [r2], r3 //save pix2
    vld1.8 {d2}, [r2], r3 //save pix2 + stride

    vld1.8 {d3}, [r4], r3 //save pix2 - 1
    vld1.8 {d4}, [r5], r3 //save pix2 + 1

    //Do the SAD for 8 bytes
    vabdl.u8 q15, d0, d1
    vabdl.u8 q14, d0, d2
    vabdl.u8 q13, d0, d3
    vabdl.u8 q12, d0, d4

    mov lr, #7
pixel_sad_4_8x8_loop_0:

    //Loading a horizontal line data (8 bytes)
    vld1.8 {d0}, [r0], r1 //save pix1
    vmov.8 d1, d6 //save pix2 - stride
    vmov.8 d6, d2
    vld1.8 {d2}, [r2], r3 //save pix2 + stride
    vld1.8 {d3}, [r4], r3 //save pix2 - 1
    vabal.u8 q15, d0, d1

    vld1.8 {d4}, [r5], r3 //save pix2 + 1
    //Do the SAD for 8 bytes
    vabal.u8 q14, d0, d2
    vabal.u8 q13, d0, d3
    vabal.u8 q12, d0, d4
    subs lr, #1
    bne pixel_sad_4_8x8_loop_0

    //Save SAD to 'r0'
    ldr r0, [sp, #12]

    vadd.u16 d0, d30, d31
    vadd.u16 d1, d28, d29
    vadd.u16 d2, d26, d27
    vadd.u16 d3, d24, d25

    vpaddl.u16 q0, q0
    vpaddl.u16 q1, q1

    vpaddl.u32 q0, q0
    vpaddl.u32 q1, q1

    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]

    ldmia sp!, {r4-r5, lr}
WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSadFour4x4_neon
    vld1.32 {d0[0]}, [r0], r1
    vld1.32 {d0[1]}, [r0], r1
    vld1.32 {d1[0]}, [r0], r1
    vld1.32 {d1[1]}, [r0]

    sub r0, r2, r3
    vld1.32 {d2[0]}, [r0], r3
    vld1.32 {d2[1]}, [r0], r3
    vld1.32 {d3[0]}, [r0], r3
    vld1.32 {d3[1]}, [r0], r3
    vld1.32 {d4[0]}, [r0], r3
    vld1.32 {d4[1]}, [r0]

    sub r0, r2, #1
    vld1.32 {d5[0]}, [r0], r3
    vld1.32 {d5[1]}, [r0], r3
    vld1.32 {d6[0]}, [r0], r3
    vld1.32 {d6[1]}, [r0]

    add r0, r2, #1
    vld1.32 {d7[0]}, [r0], r3
    vld1.32 {d7[1]}, [r0], r3
    vld1.32 {d8[0]}, [r0], r3
    vld1.32 {d8[1]}, [r0]

    vabdl.u8 q15, d0, d2
    vabdl.u8 q14, d1, d3

    vabdl.u8 q13, d0, d3
    vabdl.u8 q12, d1, d4

    vabdl.u8 q11, d0, d5
    vabdl.u8 q10, d1, d6

    vabdl.u8 q9, d0, d7
    vabdl.u8 q8, d1, d8

    //Save SAD to 'r0'
    ldr r0, [sp]
    vadd.u16 q0, q14, q15
    vadd.u16 q1, q12, q13
    vadd.u16 q2, q10, q11
    vadd.u16 q3, q8 , q9

    vadd.u16 d0, d1
    vadd.u16 d1, d2, d3
    vadd.u16 d2, d4, d5
    vadd.u16 d3, d6, d7

    vpaddl.u16 q0, q0
    vpaddl.u16 q1, q1

    vpaddl.u32 q0, q0
    vpaddl.u32 q1, q1

    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r0]

WELS_ASM_FUNC_END
@ -834,16 +834,16 @@ WELS_ASM_FUNC_END
WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon

    //Load the pix1 data --- 16 bytes
    vld1.32 {d0[0]}, [r0], r1
    vld1.32 {d0[1]}, [r0], r1
    vld1.32 {d1[0]}, [r0], r1
    vld1.32 {d1[1]}, [r0]

    //Load the pix2 data --- 16 bytes
    vld1.32 {d2[0]}, [r2], r3
    vld1.32 {d2[1]}, [r2], r3
    vld1.32 {d3[0]}, [r2], r3
    vld1.32 {d3[1]}, [r2]

    //Get the difference
    vsubl.u8 q15, d0, d2 //{0,1,2,3,4,5,6,7}
@ -874,7 +874,7 @@ WELS_ASM_FUNC_BEGIN WelsSampleSatd4x4_neon
    vpaddl.u16 d0, d0
    vpaddl.u32 d0, d0

    vmov.u32 r0, d0[0]

WELS_ASM_FUNC_END
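WelsSampleSatd4x4_neon computes a SATD: the 4x4 residual is run through a Hadamard transform and the absolute coefficients are summed. The middle of the routine is elided by the diff above, so the following C sketch only illustrates the usual 4x4 SATD definition; the exact normalization in this codebase may differ:

#include <stdint.h>
#include <stdlib.h>

/* Reference 4x4 SATD sketch: Hadamard-transform the difference
 * block, then sum the absolute coefficients (>> 1 compensates for
 * the unnormalized transform in common conventions). */
static int32_t Satd4x4_c(const uint8_t* p1, int32_t s1,
                         const uint8_t* p2, int32_t s2) {
    int32_t d[4][4], m[4][4];
    for (int y = 0; y < 4; y++)
        for (int x = 0; x < 4; x++)
            d[y][x] = p1[y * s1 + x] - p2[y * s2 + x];
    for (int y = 0; y < 4; y++) {            /* horizontal pass */
        int32_t a = d[y][0] + d[y][3], b = d[y][1] + d[y][2];
        int32_t c = d[y][0] - d[y][3], e = d[y][1] - d[y][2];
        m[y][0] = a + b; m[y][1] = a - b;
        m[y][2] = c + e; m[y][3] = c - e;
    }
    int32_t satd = 0;
    for (int x = 0; x < 4; x++) {            /* vertical pass */
        int32_t a = m[0][x] + m[3][x], b = m[1][x] + m[2][x];
        int32_t c = m[0][x] - m[3][x], e = m[1][x] - m[2][x];
        satd += abs(a + b) + abs(a - b) + abs(c + e) + abs(c - e);
    }
    return satd >> 1;
}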
File diff suppressed because it is too large
@ -55,262 +55,262 @@ sse2_b_1 db -1, -1, -1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1

align 16
byte_1pos_table:
    db 0,0,0,0,0,0,0,0, ;0
    db 0,0,0,0,0,0,0,1, ;1
    db 1,0,0,0,0,0,0,1, ;2
    db 1,0,0,0,0,0,0,2, ;3
    db 2,0,0,0,0,0,0,1, ;4
    db 2,0,0,0,0,0,0,2, ;5
    db 2,1,0,0,0,0,0,2, ;6
    db 2,1,0,0,0,0,0,3, ;7
    db 3,0,0,0,0,0,0,1, ;8
    db 3,0,0,0,0,0,0,2, ;9
    db 3,1,0,0,0,0,0,2, ;10
    db 3,1,0,0,0,0,0,3, ;11
    db 3,2,0,0,0,0,0,2, ;12
    db 3,2,0,0,0,0,0,3, ;13
    db 3,2,1,0,0,0,0,3, ;14
    db 3,2,1,0,0,0,0,4, ;15
    db 4,0,0,0,0,0,0,1, ;16
    db 4,0,0,0,0,0,0,2, ;17
    db 4,1,0,0,0,0,0,2, ;18
    db 4,1,0,0,0,0,0,3, ;19
    db 4,2,0,0,0,0,0,2, ;20
    db 4,2,0,0,0,0,0,3, ;21
    db 4,2,1,0,0,0,0,3, ;22
    db 4,2,1,0,0,0,0,4, ;23
    db 4,3,0,0,0,0,0,2, ;24
    db 4,3,0,0,0,0,0,3, ;25
    db 4,3,1,0,0,0,0,3, ;26
    db 4,3,1,0,0,0,0,4, ;27
    db 4,3,2,0,0,0,0,3, ;28
    db 4,3,2,0,0,0,0,4, ;29
    db 4,3,2,1,0,0,0,4, ;30
    db 4,3,2,1,0,0,0,5, ;31
    db 5,0,0,0,0,0,0,1, ;32
    db 5,0,0,0,0,0,0,2, ;33
    db 5,1,0,0,0,0,0,2, ;34
    db 5,1,0,0,0,0,0,3, ;35
    db 5,2,0,0,0,0,0,2, ;36
    db 5,2,0,0,0,0,0,3, ;37
    db 5,2,1,0,0,0,0,3, ;38
    db 5,2,1,0,0,0,0,4, ;39
    db 5,3,0,0,0,0,0,2, ;40
    db 5,3,0,0,0,0,0,3, ;41
    db 5,3,1,0,0,0,0,3, ;42
    db 5,3,1,0,0,0,0,4, ;43
    db 5,3,2,0,0,0,0,3, ;44
    db 5,3,2,0,0,0,0,4, ;45
    db 5,3,2,1,0,0,0,4, ;46
    db 5,3,2,1,0,0,0,5, ;47
    db 5,4,0,0,0,0,0,2, ;48
    db 5,4,0,0,0,0,0,3, ;49
    db 5,4,1,0,0,0,0,3, ;50
    db 5,4,1,0,0,0,0,4, ;51
    db 5,4,2,0,0,0,0,3, ;52
    db 5,4,2,0,0,0,0,4, ;53
    db 5,4,2,1,0,0,0,4, ;54
    db 5,4,2,1,0,0,0,5, ;55
    db 5,4,3,0,0,0,0,3, ;56
    db 5,4,3,0,0,0,0,4, ;57
    db 5,4,3,1,0,0,0,4, ;58
    db 5,4,3,1,0,0,0,5, ;59
    db 5,4,3,2,0,0,0,4, ;60
    db 5,4,3,2,0,0,0,5, ;61
    db 5,4,3,2,1,0,0,5, ;62
    db 5,4,3,2,1,0,0,6, ;63
    db 6,0,0,0,0,0,0,1, ;64
    db 6,0,0,0,0,0,0,2, ;65
    db 6,1,0,0,0,0,0,2, ;66
    db 6,1,0,0,0,0,0,3, ;67
    db 6,2,0,0,0,0,0,2, ;68
    db 6,2,0,0,0,0,0,3, ;69
    db 6,2,1,0,0,0,0,3, ;70
    db 6,2,1,0,0,0,0,4, ;71
    db 6,3,0,0,0,0,0,2, ;72
    db 6,3,0,0,0,0,0,3, ;73
    db 6,3,1,0,0,0,0,3, ;74
    db 6,3,1,0,0,0,0,4, ;75
    db 6,3,2,0,0,0,0,3, ;76
    db 6,3,2,0,0,0,0,4, ;77
    db 6,3,2,1,0,0,0,4, ;78
    db 6,3,2,1,0,0,0,5, ;79
    db 6,4,0,0,0,0,0,2, ;80
    db 6,4,0,0,0,0,0,3, ;81
    db 6,4,1,0,0,0,0,3, ;82
    db 6,4,1,0,0,0,0,4, ;83
    db 6,4,2,0,0,0,0,3, ;84
    db 6,4,2,0,0,0,0,4, ;85
    db 6,4,2,1,0,0,0,4, ;86
    db 6,4,2,1,0,0,0,5, ;87
    db 6,4,3,0,0,0,0,3, ;88
    db 6,4,3,0,0,0,0,4, ;89
    db 6,4,3,1,0,0,0,4, ;90
    db 6,4,3,1,0,0,0,5, ;91
    db 6,4,3,2,0,0,0,4, ;92
    db 6,4,3,2,0,0,0,5, ;93
    db 6,4,3,2,1,0,0,5, ;94
    db 6,4,3,2,1,0,0,6, ;95
    db 6,5,0,0,0,0,0,2, ;96
    db 6,5,0,0,0,0,0,3, ;97
    db 6,5,1,0,0,0,0,3, ;98
    db 6,5,1,0,0,0,0,4, ;99
    db 6,5,2,0,0,0,0,3, ;100
    db 6,5,2,0,0,0,0,4, ;101
    db 6,5,2,1,0,0,0,4, ;102
    db 6,5,2,1,0,0,0,5, ;103
    db 6,5,3,0,0,0,0,3, ;104
    db 6,5,3,0,0,0,0,4, ;105
    db 6,5,3,1,0,0,0,4, ;106
    db 6,5,3,1,0,0,0,5, ;107
    db 6,5,3,2,0,0,0,4, ;108
    db 6,5,3,2,0,0,0,5, ;109
    db 6,5,3,2,1,0,0,5, ;110
    db 6,5,3,2,1,0,0,6, ;111
    db 6,5,4,0,0,0,0,3, ;112
    db 6,5,4,0,0,0,0,4, ;113
    db 6,5,4,1,0,0,0,4, ;114
    db 6,5,4,1,0,0,0,5, ;115
    db 6,5,4,2,0,0,0,4, ;116
    db 6,5,4,2,0,0,0,5, ;117
    db 6,5,4,2,1,0,0,5, ;118
    db 6,5,4,2,1,0,0,6, ;119
    db 6,5,4,3,0,0,0,4, ;120
    db 6,5,4,3,0,0,0,5, ;121
    db 6,5,4,3,1,0,0,5, ;122
    db 6,5,4,3,1,0,0,6, ;123
    db 6,5,4,3,2,0,0,5, ;124
    db 6,5,4,3,2,0,0,6, ;125
    db 6,5,4,3,2,1,0,6, ;126
    db 6,5,4,3,2,1,0,7, ;127
    db 7,0,0,0,0,0,0,1, ;128
    db 7,0,0,0,0,0,0,2, ;129
    db 7,1,0,0,0,0,0,2, ;130
    db 7,1,0,0,0,0,0,3, ;131
    db 7,2,0,0,0,0,0,2, ;132
    db 7,2,0,0,0,0,0,3, ;133
    db 7,2,1,0,0,0,0,3, ;134
    db 7,2,1,0,0,0,0,4, ;135
    db 7,3,0,0,0,0,0,2, ;136
    db 7,3,0,0,0,0,0,3, ;137
    db 7,3,1,0,0,0,0,3, ;138
    db 7,3,1,0,0,0,0,4, ;139
    db 7,3,2,0,0,0,0,3, ;140
    db 7,3,2,0,0,0,0,4, ;141
    db 7,3,2,1,0,0,0,4, ;142
    db 7,3,2,1,0,0,0,5, ;143
    db 7,4,0,0,0,0,0,2, ;144
    db 7,4,0,0,0,0,0,3, ;145
    db 7,4,1,0,0,0,0,3, ;146
    db 7,4,1,0,0,0,0,4, ;147
    db 7,4,2,0,0,0,0,3, ;148
    db 7,4,2,0,0,0,0,4, ;149
    db 7,4,2,1,0,0,0,4, ;150
    db 7,4,2,1,0,0,0,5, ;151
    db 7,4,3,0,0,0,0,3, ;152
    db 7,4,3,0,0,0,0,4, ;153
    db 7,4,3,1,0,0,0,4, ;154
    db 7,4,3,1,0,0,0,5, ;155
    db 7,4,3,2,0,0,0,4, ;156
    db 7,4,3,2,0,0,0,5, ;157
    db 7,4,3,2,1,0,0,5, ;158
    db 7,4,3,2,1,0,0,6, ;159
    db 7,5,0,0,0,0,0,2, ;160
    db 7,5,0,0,0,0,0,3, ;161
    db 7,5,1,0,0,0,0,3, ;162
    db 7,5,1,0,0,0,0,4, ;163
    db 7,5,2,0,0,0,0,3, ;164
    db 7,5,2,0,0,0,0,4, ;165
    db 7,5,2,1,0,0,0,4, ;166
    db 7,5,2,1,0,0,0,5, ;167
    db 7,5,3,0,0,0,0,3, ;168
    db 7,5,3,0,0,0,0,4, ;169
    db 7,5,3,1,0,0,0,4, ;170
    db 7,5,3,1,0,0,0,5, ;171
    db 7,5,3,2,0,0,0,4, ;172
    db 7,5,3,2,0,0,0,5, ;173
    db 7,5,3,2,1,0,0,5, ;174
    db 7,5,3,2,1,0,0,6, ;175
    db 7,5,4,0,0,0,0,3, ;176
    db 7,5,4,0,0,0,0,4, ;177
    db 7,5,4,1,0,0,0,4, ;178
    db 7,5,4,1,0,0,0,5, ;179
    db 7,5,4,2,0,0,0,4, ;180
    db 7,5,4,2,0,0,0,5, ;181
    db 7,5,4,2,1,0,0,5, ;182
    db 7,5,4,2,1,0,0,6, ;183
    db 7,5,4,3,0,0,0,4, ;184
    db 7,5,4,3,0,0,0,5, ;185
    db 7,5,4,3,1,0,0,5, ;186
    db 7,5,4,3,1,0,0,6, ;187
    db 7,5,4,3,2,0,0,5, ;188
    db 7,5,4,3,2,0,0,6, ;189
    db 7,5,4,3,2,1,0,6, ;190
    db 7,5,4,3,2,1,0,7, ;191
    db 7,6,0,0,0,0,0,2, ;192
    db 7,6,0,0,0,0,0,3, ;193
    db 7,6,1,0,0,0,0,3, ;194
    db 7,6,1,0,0,0,0,4, ;195
    db 7,6,2,0,0,0,0,3, ;196
    db 7,6,2,0,0,0,0,4, ;197
    db 7,6,2,1,0,0,0,4, ;198
    db 7,6,2,1,0,0,0,5, ;199
    db 7,6,3,0,0,0,0,3, ;200
    db 7,6,3,0,0,0,0,4, ;201
    db 7,6,3,1,0,0,0,4, ;202
    db 7,6,3,1,0,0,0,5, ;203
    db 7,6,3,2,0,0,0,4, ;204
    db 7,6,3,2,0,0,0,5, ;205
    db 7,6,3,2,1,0,0,5, ;206
    db 7,6,3,2,1,0,0,6, ;207
    db 7,6,4,0,0,0,0,3, ;208
    db 7,6,4,0,0,0,0,4, ;209
    db 7,6,4,1,0,0,0,4, ;210
    db 7,6,4,1,0,0,0,5, ;211
    db 7,6,4,2,0,0,0,4, ;212
    db 7,6,4,2,0,0,0,5, ;213
    db 7,6,4,2,1,0,0,5, ;214
    db 7,6,4,2,1,0,0,6, ;215
    db 7,6,4,3,0,0,0,4, ;216
    db 7,6,4,3,0,0,0,5, ;217
    db 7,6,4,3,1,0,0,5, ;218
    db 7,6,4,3,1,0,0,6, ;219
    db 7,6,4,3,2,0,0,5, ;220
    db 7,6,4,3,2,0,0,6, ;221
    db 7,6,4,3,2,1,0,6, ;222
    db 7,6,4,3,2,1,0,7, ;223
    db 7,6,5,0,0,0,0,3, ;224
    db 7,6,5,0,0,0,0,4, ;225
    db 7,6,5,1,0,0,0,4, ;226
    db 7,6,5,1,0,0,0,5, ;227
    db 7,6,5,2,0,0,0,4, ;228
    db 7,6,5,2,0,0,0,5, ;229
    db 7,6,5,2,1,0,0,5, ;230
    db 7,6,5,2,1,0,0,6, ;231
    db 7,6,5,3,0,0,0,4, ;232
    db 7,6,5,3,0,0,0,5, ;233
    db 7,6,5,3,1,0,0,5, ;234
    db 7,6,5,3,1,0,0,6, ;235
    db 7,6,5,3,2,0,0,5, ;236
    db 7,6,5,3,2,0,0,6, ;237
    db 7,6,5,3,2,1,0,6, ;238
    db 7,6,5,3,2,1,0,7, ;239
    db 7,6,5,4,0,0,0,4, ;240
    db 7,6,5,4,0,0,0,5, ;241
    db 7,6,5,4,1,0,0,5, ;242
    db 7,6,5,4,1,0,0,6, ;243
    db 7,6,5,4,2,0,0,5, ;244
    db 7,6,5,4,2,0,0,6, ;245
    db 7,6,5,4,2,1,0,6, ;246
    db 7,6,5,4,2,1,0,7, ;247
    db 7,6,5,4,3,0,0,5, ;248
    db 7,6,5,4,3,0,0,6, ;249
    db 7,6,5,4,3,1,0,6, ;250
    db 7,6,5,4,3,1,0,7, ;251
    db 7,6,5,4,3,2,0,6, ;252
    db 7,6,5,4,3,2,0,7, ;253
    db 7,6,5,4,3,2,1,7, ;254
    db 7,6,5,4,3,2,1,8, ;255
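byte_1pos_table has one 8-byte row per byte value n: the first seven bytes list the positions of the set bits of n from highest to lowest, padded with zeros, and the eighth byte is the popcount. A sketch of how such a row can be generated (illustrative helper, not part of the source):

#include <stdint.h>

/* Build one row of byte_1pos_table: set-bit positions of n, highest
 * first, padded with 0; row[7] holds the number of set bits. */
static void Build1PosRow(uint8_t n, uint8_t row[8]) {
    int k = 0;
    for (int bit = 7; bit >= 0; bit--)
        if (n & (1u << bit))
            row[k++] = (uint8_t)bit;
    for (int i = k; i < 7; i++)
        row[i] = 0;
    row[7] = (uint8_t)k;
}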
;***********************************************************************
; Code
@ -323,43 +323,43 @@ SECTION .text
;int32_t CavlcParamCal_sse2(int16_t*coffLevel, uint8_t* run, int16_t *Level, int32_t* total_coeffs , int32_t endIdx);
;***********************************************************************
WELS_EXTERN CavlcParamCal_sse2
    push ebx
    push edi
    push esi

    mov eax, [esp+16] ;coffLevel
    mov edi, [esp+24] ;Level
    mov ebx, [esp+32] ;endIdx
    cmp ebx, 3
    jne .Level16
    pxor xmm1, xmm1
    movq xmm0, [eax] ; removed QWORD
    jmp .Cal_begin
.Level16:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax+16]
.Cal_begin:
    movdqa xmm2, xmm0
    packsswb xmm0, xmm1
    movdqa xmm4, xmm0
    pxor xmm3, xmm3
    pcmpgtb xmm0, xmm3
    pcmpgtb xmm3, xmm4
    por xmm0, xmm3
    pmovmskb edx, xmm0
    cmp edx, 0
    je near .return
    movdqa xmm6, [sse2_b_1]
    pcmpeqw xmm7, xmm7 ;generate -1
    mov ebx, 0xff
    ;pinsrw xmm6, ebx, 3

    mov bl, dh

    lea ebx, [byte_1pos_table+8*ebx]
    movq xmm0, [ebx]
    pextrw ecx, xmm0, 3
    shr ecx, 8
    mov dh, cl

.loopHighFind0:
@ -367,19 +367,19 @@ WELS_EXTERN CavlcParamCal_sse2
    je .loopHighFind0End
    ;mov esi, [ebx]
    ;and esi, 0xff
    movzx esi, byte [ebx]
    add esi, 8
    mov esi, [eax+2*esi]
    mov [edi], si
    add edi, 2
    ;add ebx, 1
    inc ebx
    dec ecx
    jmp .loopHighFind0
.loopHighFind0End:
    mov cl, dh
    cmp cl, 8
    pand xmm0, xmm6
    jne .LowByteFind0
    sub edi, 2
    mov esi, [eax+16]
@ -387,8 +387,8 @@ WELS_EXTERN CavlcParamCal_sse2
    add edi, 2
.LowByteFind0:
    and edx, 0xff
    lea ebx, [byte_1pos_table+8*edx]
    movq xmm1, [ebx]
    pextrw esi, xmm1, 3
    or esi, 0xff
    or ecx, 0xff00
@ -398,16 +398,16 @@ WELS_EXTERN CavlcParamCal_sse2
.loopLowFind0:
    cmp esi, 0
    je .loopLowFind0End
    ;mov edx, [ebx]
    ;and edx, 0xff
    movzx edx, byte [ebx]
    mov edx, [eax+2*edx]
    mov [edi], dx
    add edi, 2
    ;add ebx, 1
    inc ebx
    dec esi
    jmp .loopLowFind0
.loopLowFind0End:
    cmp ch, 8
    jne .getLevelEnd
@ -415,12 +415,12 @@ WELS_EXTERN CavlcParamCal_sse2
    mov edx, [eax]
    mov [edi], dx
.getLevelEnd:
    mov edx, [esp+28] ;total_coeffs
    ;mov ebx, ecx
    ;and ebx, 0xff
    movzx ebx, byte cl
    add cl, ch
    mov [edx], cl
    ;getRun
    movq xmm5, [sse2_b8]
    paddb xmm0, xmm5
@ -430,7 +430,7 @@ WELS_EXTERN CavlcParamCal_sse2
    sub eax, ebx
    shl eax, 3
    shl ebx, 3
    pinsrw xmm2, ebx, 0
    pinsrw xmm3, eax, 0
    psllq xmm0, xmm3
    psrlq xmm0, xmm3
@ -441,19 +441,19 @@ WELS_EXTERN CavlcParamCal_sse2
    por xmm0, xmm1

    pextrw eax, xmm0, 0
    and eax, 0xff
    inc eax
    sub al, cl
    movdqa xmm1, xmm0
    paddb xmm1, xmm7
    psrldq xmm0, 1
    psubb xmm1, xmm0
    mov ecx, [esp+20] ;run
    movdqa [ecx], xmm1
    ;getRunEnd
.return:
    pop esi
    pop edi
    pop ebx
    ret
%endif
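As context for the routine above: CavlcParamCal scans the coefficients from the highest index down, writes each nonzero value to Level, counts total_coeffs, and records in run the number of zeros between consecutive nonzero coefficients. A hedged scalar sketch of that contract (not the SSE2 algorithm itself, and the exact run semantics here are an assumption):

#include <stdint.h>

/* Scalar sketch of what CavlcParamCal_sse2 produces: levels in
 * reverse scan order, zero runs between them, and the count. */
static void CavlcParamCal_c(const int16_t* coffLevel, uint8_t* run,
                            int16_t* Level, int32_t* total_coeffs,
                            int32_t endIdx) {
    int32_t n = 0;
    int32_t last = endIdx + 1; /* scan index above the previous level */
    for (int32_t i = endIdx; i >= 0; i--) {
        if (coffLevel[i] != 0) {
            if (n > 0)
                run[n - 1] = (uint8_t)(last - i - 1);
            Level[n++] = coffLevel[i];
            last = i;
        }
    }
    if (n > 0)
        run[n - 1] = (uint8_t)last; /* zeros below the lowest level */
    *total_coeffs = n;
}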
@ -50,17 +50,17 @@ SECTION .rodata align=16

align 16
SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
        dw 10, 13, 10, 13, 13, 16, 13, 16,
        dw 11, 14, 11, 14, 14, 18, 14, 18,
        dw 11, 14, 11, 14, 14, 18, 14, 18,
        dw 13, 16, 13, 16, 16, 20, 16, 20,
        dw 13, 16, 13, 16, 16, 20, 16, 20,
        dw 14, 18, 14, 18, 18, 23, 18, 23,
        dw 14, 18, 14, 18, 18, 23, 18, 23,
        dw 16, 20, 16, 20, 20, 25, 20, 25,
        dw 16, 20, 16, 20, 20, 25, 20, 25,
        dw 18, 23, 18, 23, 23, 29, 23, 29,
        dw 18, 23, 18, 23, 23, 29, 23, 29


;***********************************************************************
@ -68,27 +68,27 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
;***********************************************************************

%macro MMX_LoadDiff4P 5
    movd %1, [%3]
    movd %2, [%4]
    punpcklbw %1, %5
    punpcklbw %2, %5
    psubw %1, %2
%endmacro

%macro MMX_LoadDiff4x4P 10 ;d0, d1, d2, d3, pix1address, pix1stride, pix2address, pix2stride, tmp(mm), 0(mm)
    MMX_LoadDiff4P %1, %9, %5, %7, %10
    MMX_LoadDiff4P %2, %9, %5+%6, %7+%8, %10
    lea %5, [%5+2*%6]
    lea %7, [%7+2*%8]
    MMX_LoadDiff4P %3, %9, %5, %7, %10
    MMX_LoadDiff4P %4, %9, %5+%6, %7+%8, %10
%endmacro

%macro MMX_SumSubMul2 3
    movq %3, %1
    psllw %1, $01
    paddw %1, %2
    psllw %2, $01
    psubw %3, %2
%endmacro

@ -101,23 +101,23 @@ SSE2_DeQuant8 dw 10, 13, 10, 13, 13, 16, 13, 16,
%endmacro

%macro MMX_SumSub 3
    movq %3, %2
    psubw %2, %1
    paddw %1, %3
%endmacro

%macro MMX_DCT 6
    MMX_SumSub %4, %1, %6
    MMX_SumSub %3, %2, %6
    MMX_SumSub %3, %4, %6
    MMX_SumSubMul2 %1, %2, %5
%endmacro

%macro MMX_IDCT 6
    MMX_SumSub %4, %5, %6
    MMX_SumSubDiv2 %3, %2, %1
    MMX_SumSub %1, %4, %6
    MMX_SumSub %3, %5, %6
%endmacro

%macro MMX_StoreDiff4P 6
@ -142,11 +142,11 @@ WELS_EXTERN WelsDctT4_mmx

    MMX_LoadDiff4x4P mm1, mm2, mm3, mm4, r1, r2, r3, r4, mm0, mm7

    MMX_DCT mm1, mm2, mm3 ,mm4, mm5, mm6
    MMX_Trans4x4W mm3, mm1, mm4, mm5, mm2

    MMX_DCT mm3, mm5, mm2 ,mm4, mm1, mm6
    MMX_Trans4x4W mm2, mm3, mm4, mm1, mm5

    movq [r0+ 0], mm2
    movq [r0+ 8], mm1
@ -170,22 +170,22 @@ WELS_EXTERN WelsIDctT4Rec_mmx
    movq mm2, [r4+16]
    movq mm3, [r4+24]

    MMX_Trans4x4W mm0, mm1, mm2, mm3, mm4
    MMX_IDCT mm1, mm2, mm3, mm4, mm0, mm6
    MMX_Trans4x4W mm1, mm3, mm0, mm4, mm2
    MMX_IDCT mm3, mm0, mm4, mm2, mm1, mm6

    WELS_Zero mm7
    WELS_DW32 mm6

    MMX_StoreDiff4P mm3, mm0, mm6, mm7, [r0], [r2]
    MMX_StoreDiff4P mm4, mm0, mm6, mm7, [r0+r1], [r2+r3]
    lea r0, [r0+2*r1]
    lea r2, [r2+2*r3]
    MMX_StoreDiff4P mm1, mm0, mm6, mm7, [r0], [r2]
    MMX_StoreDiff4P mm2, mm0, mm6, mm7, [r0+r1], [r2+r3]

    WELSEMMS
    LOAD_5_PARA_POP
    ret
@ -194,21 +194,21 @@ WELS_EXTERN WelsIDctT4Rec_mmx
; SSE2 functions
;***********************************************************************
%macro SSE2_Store4x8p 6
    SSE2_XSawp qdq, %2, %3, %6
    SSE2_XSawp qdq, %4, %5, %3
    MOVDQ [%1+0x00], %2
    MOVDQ [%1+0x10], %4
    MOVDQ [%1+0x20], %6
    MOVDQ [%1+0x30], %3
%endmacro

%macro SSE2_Load4x8p 6
    MOVDQ %2, [%1+0x00]
    MOVDQ %4, [%1+0x10]
    MOVDQ %6, [%1+0x20]
    MOVDQ %3, [%1+0x30]
    SSE2_XSawp qdq, %4, %3, %5
    SSE2_XSawp qdq, %2, %6, %3
%endmacro

%macro SSE2_SumSubMul2 3
@ -231,57 +231,57 @@ WELS_EXTERN WelsIDctT4Rec_mmx
%macro SSE2_StoreDiff8p 6
    paddw %1, %3
    psraw %1, $06
    movq %2, %6
    punpcklbw %2, %4
    paddsw %2, %1
    packuswb %2, %2
    movq %5, %2
%endmacro

%macro SSE2_StoreDiff8p 5
    movq %2, %5
    punpcklbw %2, %3
    paddsw %2, %1
    packuswb %2, %2
    movq %4, %2
%endmacro

%macro SSE2_Load8DC 6
    movdqa %1, %6 ; %1 = dc0 dc1
    paddw %1, %5
    psraw %1, $06 ; (dc + 32) >> 6

    movdqa %2, %1
    psrldq %2, 4
    punpcklwd %2, %2
    punpckldq %2, %2 ; %2 = dc2 dc2 dc2 dc2 dc3 dc3 dc3 dc3

    movdqa %3, %1
    psrldq %3, 8
    punpcklwd %3, %3
    punpckldq %3, %3 ; %3 = dc4 dc4 dc4 dc4 dc5 dc5 dc5 dc5

    movdqa %4, %1
    psrldq %4, 12
    punpcklwd %4, %4
    punpckldq %4, %4 ; %4 = dc6 dc6 dc6 dc6 dc7 dc7 dc7 dc7

    punpcklwd %1, %1
    punpckldq %1, %1 ; %1 = dc0 dc0 dc0 dc0 dc1 dc1 dc1 dc1
%endmacro

%macro SSE2_DCT 6
    SSE2_SumSub %6, %3, %5
    SSE2_SumSub %1, %2, %5
    SSE2_SumSub %3, %2, %5
    SSE2_SumSubMul2 %6, %1, %4
%endmacro

%macro SSE2_IDCT 7
    SSE2_SumSub %7, %2, %6
    SSE2_SumSubDiv2 %1, %3, %5, %4
    SSE2_SumSub %2, %1, %5
    SSE2_SumSub %7, %4, %5
%endmacro

;***********************************************************************
@ -294,42 +294,42 @@ WELS_EXTERN WelsDctFourT4_sse2
    SIGN_EXTENSION r2, r2d
    SIGN_EXTENSION r4, r4d
    pxor xmm7, xmm7
    ;Load 4x8
    SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1], [r3]
    SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2], [r3+r4]
    lea r1, [r1 + 2 * r2]
    lea r3, [r3 + 2 * r4]
    SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
    SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]

    SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
    SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
    SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
    SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0

    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5

    lea r1, [r1 + 2 * r2]
    lea r3, [r3 + 2 * r4]

    ;Load 4x8
    SSE2_LoadDiff8P xmm0, xmm6, xmm7, [r1 ], [r3 ]
    SSE2_LoadDiff8P xmm1, xmm6, xmm7, [r1+r2 ], [r3+r4]
    lea r1, [r1 + 2 * r2]
    lea r3, [r3 + 2 * r4]
    SSE2_LoadDiff8P xmm2, xmm6, xmm7, [r1], [r3]
    SSE2_LoadDiff8P xmm3, xmm6, xmm7, [r1+r2], [r3+r4]

    SSE2_DCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm0
    SSE2_TransTwo4x4W xmm2, xmm0, xmm3, xmm4, xmm1
    SSE2_DCT xmm0, xmm4, xmm1, xmm3, xmm5, xmm2
    SSE2_TransTwo4x4W xmm4, xmm2, xmm1, xmm3, xmm0

    lea r0, [r0+64]
    SSE2_Store4x8p r0, xmm4, xmm2, xmm3, xmm0, xmm5

    POP_XMM
    LOAD_5_PARA_POP
    ret

@ -337,168 +337,168 @@ WELS_EXTERN WelsDctFourT4_sse2
; void WelsIDctFourT4Rec_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *rs);
;***********************************************************************
WELS_EXTERN WelsIDctFourT4Rec_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    ;Load 4x8
    SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5

    SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
    SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

    WELS_Zero xmm7
    WELS_DW32 xmm6

    SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
    SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
    SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]

    add r4, 64
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_Load4x8p r4, xmm0, xmm1, xmm4, xmm2, xmm5

    SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3
    SSE2_IDCT xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm0
    SSE2_TransTwo4x4W xmm1, xmm4, xmm0, xmm2, xmm3
    SSE2_IDCT xmm4, xmm2, xmm3, xmm0, xmm5, xmm6, xmm1

    WELS_Zero xmm7
    WELS_DW32 xmm6

    SSE2_StoreDiff8p xmm4, xmm5, xmm6, xmm7, [r0 ], [r2]
    SSE2_StoreDiff8p xmm0, xmm5, xmm6, xmm7, [r0 + r1 ], [r2 + r3]
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff8p xmm1, xmm5, xmm6, xmm7, [r0], [r2]
    SSE2_StoreDiff8p xmm2, xmm5, xmm6, xmm7, [r0 + r1], [r2 + r3]
    POP_XMM
    LOAD_5_PARA_POP
    ; pop esi
    ; pop ebx
    ret
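The MMX_DCT/SSE2_DCT and MMX_IDCT/SSE2_IDCT macro chains above compose the standard H.264 4x4 core transforms out of sum/difference butterflies (SumSub), a *2 weighting (SumSubMul2) and half-resolution taps (SumSubDiv2). A scalar sketch of the arithmetic of one 1-D pass each way; it is applied to rows, transposed, then applied again for the 2-D transform (these helper names are mine, not the project's):

#include <stdint.h>

/* One 1-D pass of the H.264 4x4 forward transform. */
static void Dct4Pass(const int16_t in[4], int16_t out[4]) {
    int16_t s03 = in[0] + in[3], d03 = in[0] - in[3];
    int16_t s12 = in[1] + in[2], d12 = in[1] - in[2];
    out[0] = s03 + s12;
    out[2] = s03 - s12;
    out[1] = (int16_t)(2 * d03 + d12);
    out[3] = (int16_t)(d03 - 2 * d12);
}

/* One 1-D pass of the matching inverse transform, with the >>1
 * taps that SumSubDiv2 provides. */
static void IDct4Pass(const int16_t in[4], int16_t out[4]) {
    int16_t a = in[0] + in[2];          /* even part */
    int16_t b = in[0] - in[2];
    int16_t c = (int16_t)((in[1] >> 1) - in[3]);  /* odd part */
    int16_t d = (int16_t)(in[1] + (in[3] >> 1));
    out[0] = a + d;
    out[1] = b + c;
    out[2] = b - c;
    out[3] = a - d;
}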
%macro SSE2_StoreDiff4x8p 8
    SSE2_StoreDiff8p %1, %3, %4, [%5], [%6]
    SSE2_StoreDiff8p %1, %3, %4, [%5 + %7], [%6 + %8]
    SSE2_StoreDiff8p %2, %3, %4, [%5 + 8], [%6 + 8]
    SSE2_StoreDiff8p %2, %3, %4, [%5 + %7 + 8], [%6 + %8 + 8]
%endmacro

;***********************************************************************
; void WelsIDctRecI16x16Dc_sse2(uint8_t *rec, int32_t stride, uint8_t *pred, int32_t pred_stride, int16_t *dct_dc)
;***********************************************************************
WELS_EXTERN WelsIDctRecI16x16Dc_sse2
    %assign push_num 0
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor xmm7, xmm7
    WELS_DW32 xmm6

    SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4]
    SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3

    SSE2_Load8DC xmm0, xmm1, xmm2, xmm3, xmm6, [r4 + 16]
    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p xmm0, xmm1, xmm5, xmm7, r0, r2, r1, r3

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3

    lea r0, [r0 + 2 * r1]
    lea r2, [r2 + 2 * r3]
    SSE2_StoreDiff4x8p xmm2, xmm3, xmm5, xmm7, r0, r2, r1, r3
    POP_XMM
    LOAD_5_PARA_POP
    ret
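WelsIDctRecI16x16Dc_sse2 handles the DC-only path: each 4x4 sub-block of the 16x16 luma macroblock gets a single rounded DC offset, (dc + 32) >> 6, added to the prediction with unsigned saturation (SSE2_Load8DC broadcasts the DCs, SSE2_StoreDiff4x8p adds and packs). A scalar sketch (function names are illustrative):

#include <stdint.h>

static uint8_t ClipU8(int32_t v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* DC-only I16x16 reconstruction: dct_dc holds 16 DC values, one per
 * 4x4 sub-block, laid out 4x4 over the macroblock. */
static void IDctRecI16x16Dc_c(uint8_t* rec, int32_t stride,
                              const uint8_t* pred, int32_t pred_stride,
                              const int16_t dct_dc[16]) {
    for (int y = 0; y < 16; y++)
        for (int x = 0; x < 16; x++) {
            int32_t dc = (dct_dc[(y / 4) * 4 + (x / 4)] + 32) >> 6;
            rec[y * stride + x] = ClipU8(pred[y * pred_stride + x] + dc);
        }
}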
%macro SSE2_SumSubD 3
|
||||
movdqa %3, %2
|
||||
movdqa %3, %2
|
||||
paddd %2, %1
|
||||
psubd %1, %3
|
||||
%endmacro
|
||||
|
||||
%macro SSE2_SumSubDiv2D 4
|
||||
paddd %1, %2
|
||||
paddd %1, %3
|
||||
psrad %1, 1
|
||||
movdqa %4, %1
|
||||
psubd %4, %2
|
||||
paddd %1, %2
|
||||
paddd %1, %3
|
||||
psrad %1, 1
|
||||
movdqa %4, %1
|
||||
psubd %4, %2
|
||||
%endmacro
|
||||
%macro SSE2_Load4Col 5
|
||||
movsx r2, WORD[%5]
|
||||
movd %1, r2d
|
||||
movsx r2, WORD[%5 + 0x20]
|
||||
movd %2, r2d
|
||||
punpckldq %1, %2
|
||||
movsx r2, WORD[%5 + 0x80]
|
||||
movd %3, r2d
|
||||
movsx r2, WORD[%5 + 0xa0]
|
||||
movd %4, r2d
|
||||
punpckldq %3, %4
|
||||
punpcklqdq %1, %3
|
||||
%macro SSE2_Load4Col 5
|
||||
movsx r2, WORD[%5]
|
||||
movd %1, r2d
|
||||
movsx r2, WORD[%5 + 0x20]
|
||||
movd %2, r2d
|
||||
punpckldq %1, %2
|
||||
movsx r2, WORD[%5 + 0x80]
|
||||
movd %3, r2d
|
||||
movsx r2, WORD[%5 + 0xa0]
|
||||
movd %4, r2d
|
||||
punpckldq %3, %4
|
||||
punpcklqdq %1, %3
|
||||
%endmacro
|
||||
|
||||
;***********************************************************************
|
||||
;void WelsHadamardT4Dc_sse2( int16_t *luma_dc, int16_t *pDct)
|
||||
;***********************************************************************
|
||||
WELS_EXTERN WelsHadamardT4Dc_sse2
|
||||
    %assign push_num 0
    LOAD_2_PARA
    PUSH_XMM 8
    SSE2_Load4Col xmm1, xmm5, xmm6, xmm0, r1
    SSE2_Load4Col xmm2, xmm5, xmm6, xmm0, r1 + 0x40
    SSE2_Load4Col xmm3, xmm5, xmm6, xmm0, r1 + 0x100
    SSE2_Load4Col xmm4, xmm5, xmm6, xmm0, r1 + 0x140

    SSE2_SumSubD xmm1, xmm2, xmm7
    SSE2_SumSubD xmm3, xmm4, xmm7
    SSE2_SumSubD xmm2, xmm4, xmm7
    SSE2_SumSubD xmm1, xmm3, xmm7

    SSE2_Trans4x4D xmm4, xmm2, xmm1, xmm3, xmm5 ; pOut: xmm4,xmm3,xmm5,xmm1

    SSE2_SumSubD xmm4, xmm3, xmm7
    SSE2_SumSubD xmm5, xmm1, xmm7

    WELS_DD1 xmm6
    SSE2_SumSubDiv2D xmm3, xmm1, xmm6, xmm0 ; pOut: xmm3 = (xmm3+xmm1+1)/2, xmm0 = (xmm3-xmm1+1)/2
    SSE2_SumSubDiv2D xmm4, xmm5, xmm6, xmm1 ; pOut: xmm4 = (xmm4+xmm5+1)/2, xmm1 = (xmm4-xmm5+1)/2
    SSE2_Trans4x4D xmm3, xmm0, xmm1, xmm4, xmm2 ; pOut: xmm3,xmm4,xmm2,xmm1

    packssdw xmm3, xmm4
    packssdw xmm2, xmm1
    movdqa [r0+ 0], xmm3
    movdqa [r0+16], xmm2

    POP_XMM
    ret
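For reference, the routine above is a 2-D 4x4 Hadamard over the per-block DC terms, with the (x+1)>>1 rounding that SSE2_SumSubDiv2D performs on the second pass. A minimal scalar sketch in C (an illustration only, not the project's reference path; the asm's gather offsets and exact butterfly ordering are abstracted away, and `in` is assumed to already hold the 16 DC values in row-major order):

#include <stdint.h>

/* Sketch: 4x4 Hadamard butterflies per column, then per row with
 * (v + 1) >> 1 rounding applied once, mirroring the asm's comments. */
static void HadamardT4Dc_sketch(int16_t out[16], const int16_t in[16]) {
    int32_t t[16];
    for (int c = 0; c < 4; ++c) {              /* vertical butterflies */
        int32_t s0 = in[c]     + in[12 + c];
        int32_t s1 = in[4 + c] + in[8 + c];
        int32_t d1 = in[4 + c] - in[8 + c];
        int32_t d0 = in[c]     - in[12 + c];
        t[c]      = s0 + s1;
        t[4 + c]  = d1 + d0;
        t[8 + c]  = s0 - s1;
        t[12 + c] = d0 - d1;
    }
    for (int r = 0; r < 4; ++r) {              /* horizontal butterflies + rounding */
        const int32_t *p = t + 4 * r;
        int32_t s0 = p[0] + p[3], s1 = p[1] + p[2];
        int32_t d1 = p[1] - p[2], d0 = p[0] - p[3];
        out[4 * r]     = (int16_t)((s0 + s1 + 1) >> 1);
        out[4 * r + 1] = (int16_t)((d1 + d0 + 1) >> 1);
        out[4 * r + 2] = (int16_t)((s0 - s1 + 1) >> 1);
        out[4 * r + 3] = (int16_t)((d0 - d1 + 1) >> 1);
    }
}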
File diff suppressed because it is too large
@ -34,362 +34,362 @@
;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE_8x8B_MMX 10
    MMX_XSwap bw, %1, %2, %8
    MMX_XSwap bw, %3, %4, %2
    MMX_XSwap bw, %5, %6, %4
    movq %6, %9
    movq %10, %4
    MMX_XSwap bw, %7, %6, %4

    MMX_XSwap wd, %1, %3, %6
    MMX_XSwap wd, %8, %2, %3
    MMX_XSwap wd, %5, %7, %2
    movq %7, %10
    movq %10, %3
    MMX_XSwap wd, %7, %4, %3

    MMX_XSwap dq, %1, %5, %4
    MMX_XSwap dq, %6, %2, %5
    MMX_XSwap dq, %8, %7, %2
    movq %7, %10
    movq %10, %5
    MMX_XSwap dq, %7, %3, %5

    movq %3, %10
%endmacro

;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_MMX 2 ; dst, dst_stride
    movq [%1], mm0 ; result of line 1, x8 bytes
    movq [%1+%2], mm3 ; result of line 2
    lea %1, [%1+2*%2]
    movq [%1], mm5 ; result of line 3
    movq [%1+%2], mm2 ; result of line 4
    lea %1, [%1+2*%2]
    movq [%1], mm7 ; result of line 5
    movq [%1+%2], mm1 ; result of line 6
    lea %1, [%1+2*%2]
    movq [%1], mm6 ; result of line 7
    movq [%1+%2], mm4 ; result of line 8
%endmacro

;in: m0, m3, m5, m2, m7, m1, m6, m4
%macro TRANSPOSE8x8_WRITE_ALT_MMX 3 ; dst, dst_stride, reg32
    movq [%1], mm0 ; result of line 1, x8 bytes
    movq [%1+%2], mm3 ; result of line 2
    lea %3, [%1+2*%2]
    movq [%3], mm5 ; result of line 3
    movq [%3+%2], mm2 ; result of line 4
    lea %3, [%3+2*%2]
    movq [%3], mm7 ; result of line 5
    movq [%3+%2], mm1 ; result of line 6
    lea %3, [%3+2*%2]
    movq [%3], mm6 ; result of line 7
    movq [%3+%2], mm4 ; result of line 8
%endmacro ; end of TRANSPOSE8x8_WRITE_ALT_MMX

; for transpose 16x8

;in: m0, m1, m2, m3, m4, m5, m6, m7
;out: m4, m2, m3, m7, m5, m1, m6, m0
%macro TRANSPOSE_8x16B_SSE2 10
    SSE2_XSawp bw, %1, %2, %8
    SSE2_XSawp bw, %3, %4, %2
    SSE2_XSawp bw, %5, %6, %4
    movdqa %6, %9
    movdqa %10, %4
    SSE2_XSawp bw, %7, %6, %4

    SSE2_XSawp wd, %1, %3, %6
    SSE2_XSawp wd, %8, %2, %3
    SSE2_XSawp wd, %5, %7, %2
    movdqa %7, %10
    movdqa %10, %3
    SSE2_XSawp wd, %7, %4, %3

    SSE2_XSawp dq, %1, %5, %4
    SSE2_XSawp dq, %6, %2, %5
    SSE2_XSawp dq, %8, %7, %2
    movdqa %7, %10
    movdqa %10, %5
    SSE2_XSawp dq, %7, %3, %5

    SSE2_XSawp qdq, %1, %8, %3
    SSE2_XSawp qdq, %4, %2, %8
    SSE2_XSawp qdq, %6, %7, %2
    movdqa %7, %10
    movdqa %10, %1
    SSE2_XSawp qdq, %7, %5, %1
    movdqa %5, %10
%endmacro ; end of TRANSPOSE_8x16B_SSE2


%macro TRANSPOSE8x16_WRITE_SSE2 2 ; dst, dst_stride
    movq [%1], xmm4 ; result of line 1, x8 bytes
    movq [%1+%2], xmm2 ; result of line 2
    lea %1, [%1+2*%2]
    movq [%1], xmm3 ; result of line 3
    movq [%1+%2], xmm7 ; result of line 4

    lea %1, [%1+2*%2]
    movq [%1], xmm5 ; result of line 5
    movq [%1+%2], xmm1 ; result of line 6
    lea %1, [%1+2*%2]
    movq [%1], xmm6 ; result of line 7
    movq [%1+%2], xmm0 ; result of line 8

    lea %1, [%1+2*%2]
    movhpd [%1], xmm4 ; result of line 9
    movhpd [%1+%2], xmm2 ; result of line 10
    lea %1, [%1+2*%2]
    movhpd [%1], xmm3 ; result of line 11
    movhpd [%1+%2], xmm7 ; result of line 12

    lea %1, [%1+2*%2]
    movhpd [%1], xmm5 ; result of line 13
    movhpd [%1+%2], xmm1 ; result of line 14
    lea %1, [%1+2*%2]
    movhpd [%1], xmm6 ; result of line 15
    movhpd [%1+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE_WRITE_RESULT_SSE2

%macro TRANSPOSE8x16_WRITE_ALT_SSE2 3 ; dst, dst_stride, reg32
    movq [%1], xmm4 ; result of line 1, x8 bytes
    movq [%1+%2], xmm2 ; result of line 2
    lea %3, [%1+2*%2]
    movq [%3], xmm3 ; result of line 3
    movq [%3+%2], xmm7 ; result of line 4

    lea %3, [%3+2*%2]
    movq [%3], xmm5 ; result of line 5
    movq [%3+%2], xmm1 ; result of line 6
    lea %3, [%3+2*%2]
    movq [%3], xmm6 ; result of line 7
    movq [%3+%2], xmm0 ; result of line 8

    lea %3, [%3+2*%2]
    movhpd [%3], xmm4 ; result of line 9
    movhpd [%3+%2], xmm2 ; result of line 10
    lea %3, [%3+2*%2]
    movhpd [%3], xmm3 ; result of line 11
    movhpd [%3+%2], xmm7 ; result of line 12

    lea %3, [%3+2*%2]
    movhpd [%3], xmm5 ; result of line 13
    movhpd [%3+%2], xmm1 ; result of line 14
    lea %3, [%3+2*%2]
    movhpd [%3], xmm6 ; result of line 15
    movhpd [%3+%2], xmm0 ; result of line 16
%endmacro ; end of TRANSPOSE8x16_WRITE_ALT_SSE2

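The transpose macros above build the result out of interleave rounds (punpckl/h at byte, word, dword, and, for the 8x16 SSE2 variant, qword granularity), with one register spilled through the %9/%10 memory operands. The net effect is an ordinary matrix transpose; a scalar model in C (a sketch of the end result only, ignoring the register scheduling that the asm actually spends its lines on):

#include <stdint.h>

/* Sketch: what TRANSPOSE_8x8B_MMX plus a WRITE macro achieve together:
 * out[x][y] = in[y][x] over an 8x8 block of bytes. */
static void Transpose8x8B_sketch(uint8_t out[8][8], const uint8_t in[8][8]) {
    for (int y = 0; y < 8; ++y)
        for (int x = 0; x < 8; ++x)
            out[x][y] = in[y][x];
}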
SECTION .text

WELS_EXTERN TransposeMatrixBlock16x16_sse2
; void TransposeMatrixBlock16x16_sse2( void *dst/*16x16*/, const int32_t dst_stride, void *src/*16x16*/, const int32_t src_stride );
    push r4
    push r5
    %assign push_num 2
    LOAD_4_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d

    mov r4, r7
    and r4, 0Fh
    sub r7, 10h
    sub r7, r4
    lea r5, [r3+r3*2]
    ; top 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+r3*2]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+r3*4]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+r3*2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    TRANSPOSE8x16_WRITE_SSE2 r0, r1

    ; bottom 8x16 block
    lea r2, [r2+r3*4]
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    movdqa xmm2, [r2+r3*2]
    movdqa xmm3, [r2+r5]
    lea r2, [r2+r3*4]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    movdqa xmm6, [r2+r3*2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r5], [r7]

    mov r5, r1
    sal r5, 4
    sub r0, r5
    lea r0, [r0+r1*2+8]
    TRANSPOSE8x16_WRITE_SSE2 r0, r1

    add r7, r4
    add r7, 10h
    POP_XMM
    LOAD_4_PARA_POP
    pop r5
    pop r4
    ret

WELS_EXTERN TransposeMatrixBlocksx16_sse2
; void TransposeMatrixBlocksx16_sse2( void *dst/*W16x16*/, const int32_t dst_stride, void *src/*16xW16*/, const int32_t src_stride, const int32_t num_blocks );
    push r5
    push r6
    %assign push_num 2
    LOAD_5_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    mov r5, r7
    and r5, 0Fh
    sub r7, 10h
    sub r7, r5
TRANSPOSE_LOOP_SSE2:
    ; explicitly loading next loop data
    lea r6, [r2+r3*8]
    push r4
%rep 8
    mov r4, [r6]
    mov r4, [r6+r3]
    lea r6, [r6+r3*2]
%endrep
    pop r4
    ; top 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm2, [r2]
    movdqa xmm3, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm6, [r2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0, r1, r6
    lea r2, [r2+r3*2]

    ; bottom 8x16 block
    movdqa xmm0, [r2]
    movdqa xmm1, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm2, [r2]
    movdqa xmm3, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm4, [r2]
    movdqa xmm5, [r2+r3]
    lea r2, [r2+r3*2]
    movdqa xmm6, [r2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m4, m2, m3, m7, m5, m1, m6, m0
    TRANSPOSE_8x16B_SSE2 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, [r2+r3], [r7]
    TRANSPOSE8x16_WRITE_ALT_SSE2 r0+8, r1, r6
    lea r2, [r2+r3*2]
    lea r0, [r0+16]
    dec r4
    jg near TRANSPOSE_LOOP_SSE2

    add r7, r5
    add r7, 10h
    POP_XMM
    LOAD_5_PARA_POP
    pop r6
    pop r5
    ret

WELS_EXTERN TransposeMatrixBlock8x8_mmx
; void TransposeMatrixBlock8x8_mmx( void *dst/*8x8*/, const int32_t dst_stride, void *src/*8x8*/, const int32_t src_stride );
    %assign push_num 0
    LOAD_4_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    sub r7, 8

    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_MMX r0, r1

    emms
    add r7, 8
    LOAD_4_PARA_POP
    ret

WELS_EXTERN TransposeMatrixBlocksx8_mmx
; void TransposeMatrixBlocksx8_mmx( void *dst/*8xW8*/, const int32_t dst_stride, void *src/*W8x8*/, const int32_t src_stride, const int32_t num_blocks );
    push r5
    push r6
    %assign push_num 2
    LOAD_5_PARA
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    SIGN_EXTENSION r4, r4d
    sub r7, 8

    lea r5, [r2+r3*8]

TRANSPOSE_BLOCKS_X8_LOOP_MMX:
    ; explicitly loading next loop data
%rep 4
    mov r6, [r5]
    mov r6, [r5+r3]
    lea r5, [r5+r3*2]
%endrep
    movq mm0, [r2]
    movq mm1, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm2, [r2]
    movq mm3, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm4, [r2]
    movq mm5, [r2+r3]
    lea r2, [r2+2*r3]
    movq mm6, [r2]

    ;in: m0, m1, m2, m3, m4, m5, m6, m7
    ;out: m0, m3, m5, m2, m7, m1, m6, m4
    TRANSPOSE_8x8B_MMX mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, [r2+r3], [r7]

    TRANSPOSE8x8_WRITE_ALT_MMX r0, r1, r6
    lea r0, [r0+8]
    lea r2, [r2+2*r3]
    dec r4
    jg near TRANSPOSE_BLOCKS_X8_LOOP_MMX

    emms
    add r7, 8
    LOAD_5_PARA_POP
    pop r6
    pop r5
    ret

@ -51,10 +51,10 @@ SECTION .text
;void WelsPrefetchZero_mmx(int8_t const*_A);
;***********************************************************************
WELS_EXTERN WelsPrefetchZero_mmx
    %assign push_num 0
    LOAD_1_PARA
    prefetchnta [r0]
    ret


;***********************************************************************
@ -62,71 +62,71 @@ WELS_EXTERN WelsPrefetchZero_mmx
;***********************************************************************
WELS_EXTERN WelsSetMemZeroAligned64_sse2

    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    neg r1

    pxor xmm0, xmm0
.memzeroa64_sse2_loops:
    movdqa [r0], xmm0
    movdqa [r0+16], xmm0
    movdqa [r0+32], xmm0
    movdqa [r0+48], xmm0
    add r0, 0x40

    add r1, 0x40
    jnz near .memzeroa64_sse2_loops

    ret

;***********************************************************************
; void WelsSetMemZeroSize64_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize64_mmx

    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    neg r1

    pxor mm0, mm0
.memzero64_mmx_loops:
    movq [r0], mm0
    movq [r0+8], mm0
    movq [r0+16], mm0
    movq [r0+24], mm0
    movq [r0+32], mm0
    movq [r0+40], mm0
    movq [r0+48], mm0
    movq [r0+56], mm0
    add r0, 0x40

    add r1, 0x40
    jnz near .memzero64_mmx_loops

    WELSEMMS
    ret

;***********************************************************************
; void WelsSetMemZeroSize8_mmx(void *dst, int32_t size)
;***********************************************************************
WELS_EXTERN WelsSetMemZeroSize8_mmx

    %assign push_num 0
    LOAD_2_PARA
    SIGN_EXTENSION r1, r1d
    neg r1
    pxor mm0, mm0

.memzero8_mmx_loops:
    movq [r0], mm0
    add r0, 0x08

    add r1, 0x08
    jnz near .memzero8_mmx_loops

    WELSEMMS
    ret

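All three memset helpers share one loop idiom: negate the byte count up front, then count back up to zero in fixed-size steps, so the induction add also sets the zero flag that terminates the loop. A C rendering of the idiom (a sketch only, assuming size is positive and a multiple of the 0x40 step):

#include <stdint.h>
#include <string.h>

/* Sketch of the negated-counter loop in WelsSetMemZeroAligned64_sse2:
 * n starts at -size (neg r1) and reaches 0 exactly at the end, so
 * "add r1, 0x40 / jnz" is both the step and the loop test. */
static void SetMemZero_sketch(uint8_t *dst, intptr_t size) {
    intptr_t n = -size;            /* neg r1 */
    do {
        memset(dst, 0, 0x40);      /* the four movdqa stores */
        dst += 0x40;               /* add r0, 0x40 */
        n += 0x40;                 /* add r1, 0x40 */
    } while (n != 0);              /* jnz near .memzeroa64_sse2_loops */
}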
@ -49,241 +49,241 @@ SECTION .text
;************************************************

%macro SSE2_Quant8 5
    MOVDQ %1, %5
    pxor %2, %2
    pcmpgtw %2, %1
    pxor %1, %2
    psubw %1, %2
    paddusw %1, %3
    pmulhuw %1, %4
    pxor %1, %2
    psubw %1, %2
    MOVDQ %5, %1
%endmacro

%macro SSE2_QuantMax8 6
    MOVDQ %1, %5
    pxor %2, %2
    pcmpgtw %2, %1
    pxor %1, %2
    psubw %1, %2
    paddusw %1, %3
    pmulhuw %1, %4
    pmaxsw %6, %1
    pxor %1, %2
    psubw %1, %2
    MOVDQ %5, %1
%endmacro
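Per 16-bit coefficient, SSE2_Quant8 computes sign(x) * (((|x| + ff) * mf) >> 16): the pcmpgtw/pxor/psubw sequence folds the sign out and back in, paddusw adds the rounding offset with saturation, and pmulhuw keeps the high half of the unsigned product. A scalar sketch of one lane (illustration only):

#include <stdint.h>

/* Scalar model of one SSE2_Quant8 lane. */
static int16_t Quant_sketch(int16_t x, uint16_t ff, uint16_t mf) {
    int32_t sign = x >> 15;                   /* 0 or -1, like pcmpgtw */
    int32_t v = (x ^ sign) - sign;            /* |x| via pxor + psubw  */
    uint32_t a = (uint32_t)v + ff;
    if (a > 0xFFFF) a = 0xFFFF;               /* paddusw saturates     */
    v = (int32_t)((a * mf) >> 16);            /* pmulhuw: high 16 bits */
    return (int16_t)((v ^ sign) - sign);      /* restore the sign      */
}

SSE2_QuantMax8 is the same computation, plus a running pmaxsw over the absolute quantized values so the caller can read back the per-block maximum.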
%define pDct esp + 4
%define ff esp + 8
%define mf esp + 12
%define max esp + 16
;***********************************************************************
; void WelsQuant4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4_sse2
    %assign push_num 0
    LOAD_3_PARA
    movdqa xmm2, [r1]
    movdqa xmm3, [r2]

    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]

    ret

;***********************************************************************
;void WelsQuant4x4Dc_sse2(int16_t *pDct, const int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsQuant4x4Dc_sse2
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSIONW r1, r1w
    SIGN_EXTENSIONW r2, r2w
    SSE2_Copy8Times xmm3, r2d

    SSE2_Copy8Times xmm2, r1d

    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]

    ret

;***********************************************************************
; void WelsQuantFour4x4_sse2(int16_t *pDct, int16_t* ff, int16_t *mf);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4_sse2
    %assign push_num 0
    LOAD_3_PARA
    MOVDQ xmm2, [r1]
    MOVDQ xmm3, [r2]

    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60]
    SSE2_Quant8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70]

    ret

;***********************************************************************
; void WelsQuantFour4x4Max_sse2(int16_t *pDct, int32_t* f, int16_t *mf, int16_t *max);
;***********************************************************************
WELS_EXTERN WelsQuantFour4x4Max_sse2
    %assign push_num 0
    LOAD_4_PARA
    PUSH_XMM 8
    MOVDQ xmm2, [r1]
    MOVDQ xmm3, [r2]

    pxor xmm4, xmm4
    pxor xmm5, xmm5
    pxor xmm6, xmm6
    pxor xmm7, xmm7
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0], xmm4
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x10], xmm4
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x20], xmm5
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x30], xmm5
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x40], xmm6
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x50], xmm6
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x60], xmm7
    SSE2_QuantMax8 xmm0, xmm1, xmm2, xmm3, [r0 + 0x70], xmm7

    SSE2_TransTwo4x4W xmm4, xmm5, xmm6, xmm7, xmm0
    pmaxsw xmm0, xmm4
    pmaxsw xmm0, xmm5
    pmaxsw xmm0, xmm7
    movdqa xmm1, xmm0
    punpckhqdq xmm0, xmm1
    pmaxsw xmm0, xmm1

    movq [r3], xmm0
    POP_XMM
    LOAD_4_PARA_POP
    ret

%macro MMX_Copy4Times 2
    movd %1, %2
    punpcklwd %1, %1
    punpckldq %1, %1
%endmacro

SECTION .text

%macro MMX_Quant4 4
    pxor %2, %2
    pcmpgtw %2, %1
    pxor %1, %2
    psubw %1, %2
    paddusw %1, %3
    pmulhuw %1, %4
    pxor %1, %2
    psubw %1, %2
%endmacro

;***********************************************************************
;int32_t WelsHadamardQuant2x2_mmx(int16_t *rs, const int16_t ff, int16_t mf, int16_t * pDct, int16_t * block);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2_mmx
    %assign push_num 0
    LOAD_5_PARA
    SIGN_EXTENSIONW r1, r1w
    SIGN_EXTENSIONW r2, r2w
    movd mm0, [r0]
    movd mm1, [r0 + 0x20]
    punpcklwd mm0, mm1
    movd mm3, [r0 + 0x40]
    movd mm1, [r0 + 0x60]
    punpcklwd mm3, mm1

    ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
    movq mm5, mm3
    paddw mm3, mm0
    psubw mm0, mm5
    punpcklwd mm3, mm0
    movq mm1, mm3
    psrlq mm1, 32
    movq mm5, mm1
    paddw mm1, mm3
    psubw mm3, mm5
    punpcklwd mm1, mm3

    ;quant_2x2_dc
    MMX_Copy4Times mm3, r2d
    MMX_Copy4Times mm2, r1d
    MMX_Quant4 mm1, mm0, mm2, mm3

    ; store dct_2x2
    movq [r3], mm1
    movq [r4], mm1

    ; pNonZeroCount of dct_2x2
    pcmpeqb mm2, mm2 ; mm2 = FF
    pxor mm3, mm3
    packsswb mm1, mm3
    pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
    psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
    psadbw mm1, mm3
    mov r1w, 0
    mov [r0], r1w
    mov [r0 + 0x20], r1w
    mov [r0 + 0x40], r1w
    mov [r0 + 0x60], r1w

    movd retrd, mm1

    WELSEMMS
    LOAD_5_PARA_POP
    ret

;***********************************************************************
;int32_t WelsHadamardQuant2x2Skip_mmx(int16_t *pDct, int16_t ff, int16_t mf);
;***********************************************************************
WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
    %assign push_num 0
    LOAD_3_PARA
    SIGN_EXTENSIONW r1, r1w
    SIGN_EXTENSIONW r2, r2w
    movd mm0, [r0]
    movd mm1, [r0 + 0x20]
    punpcklwd mm0, mm1
    movd mm3, [r0 + 0x40]
    movd mm1, [r0 + 0x60]
    punpcklwd mm3, mm1

    ;hdm_2x2, mm0 = dct0 dct1, mm3 = dct2 dct3
    movq mm5, mm3
    paddw mm3, mm0
    psubw mm0, mm5
    punpcklwd mm3, mm0
    movq mm1, mm3
    psrlq mm1, 32
    movq mm5, mm1
    paddw mm1, mm3
    psubw mm3, mm5
    punpcklwd mm1, mm3

    ;quant_2x2_dc
    MMX_Copy4Times mm3, r2d
    MMX_Copy4Times mm2, r1d
    MMX_Quant4 mm1, mm0, mm2, mm3

    ; pNonZeroCount of dct_2x2
    pcmpeqb mm2, mm2 ; mm2 = FF
    pxor mm3, mm3
    packsswb mm1, mm3
    pcmpeqb mm1, mm3 ; set FF if equal, 0 if not equal
    psubsb mm1, mm2 ; set 0 if equal, 1 if not equal
    psadbw mm1, mm3
    movd retrd, mm1

    WELSEMMS
    ret

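Functionally, WelsHadamardQuant2x2_mmx gathers the four chroma DC terms (0x20 bytes apart in the DCT buffer), runs a 2x2 Hadamard on them, quantizes with the same |x|/ff/mf/sign scheme as above, stores the result to both output buffers, zeroes the source slots, and returns a non-zero count via pcmpeqb/psadbw. A scalar sketch (illustrative only; the packed MMX lane ordering, and hence the exact sign pattern of the butterfly, is abstracted to the textbook 2x2 Hadamard):

#include <stdint.h>

static int16_t QuantOne(int16_t x, uint16_t ff, uint16_t mf) {
    int32_t sign = x >> 15;
    uint32_t a = (uint32_t)((x ^ sign) - sign) + ff;
    if (a > 0xFFFF) a = 0xFFFF;               /* paddusw saturates */
    int32_t v = (int32_t)((a * mf) >> 16);    /* pmulhuw */
    return (int16_t)((v ^ sign) - sign);
}

/* Sketch of the non-Skip variant's visible behavior. */
static int32_t HadamardQuant2x2_sketch(int16_t *rs, int16_t ff, int16_t mf,
                                       int16_t *pDct, int16_t *block) {
    int16_t d0 = rs[0], d1 = rs[16], d2 = rs[32], d3 = rs[48]; /* 0x20 B apart */
    int32_t h[4] = { d0 + d1 + d2 + d3, d0 - d1 + d2 - d3,
                     d0 + d1 - d2 - d3, d0 - d1 - d2 + d3 };   /* 2x2 Hadamard */
    int32_t nnz = 0;
    for (int i = 0; i < 4; ++i) {
        int16_t q = QuantOne((int16_t)h[i], (uint16_t)ff, (uint16_t)mf);
        pDct[i] = q;                          /* movq [r3], mm1 */
        block[i] = q;                         /* movq [r4], mm1 */
        nnz += (q != 0);
    }
    rs[0] = rs[16] = rs[32] = rs[48] = 0;     /* the mov [r0+...], r1w stores */
    return nnz;                               /* psadbw reduction */
}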
%macro SSE2_DeQuant8 3
@ -297,12 +297,12 @@ WELS_EXTERN WelsHadamardQuant2x2Skip_mmx
; void WelsDequant4x4_sse2(int16_t *pDct, const uint16_t* mf);
;***********************************************************************
WELS_EXTERN WelsDequant4x4_sse2
    %assign push_num 0
    LOAD_2_PARA

    movdqa xmm1, [r1]
    SSE2_DeQuant8 [r0], xmm0, xmm1
    SSE2_DeQuant8 [r0 + 0x10], xmm0, xmm1

    ret

@ -311,18 +311,18 @@ WELS_EXTERN WelsDequant4x4_sse2
;***********************************************************************

WELS_EXTERN WelsDequantFour4x4_sse2
    %assign push_num 0
    LOAD_2_PARA

    movdqa xmm1, [r1]
    SSE2_DeQuant8 [r0], xmm0, xmm1
    SSE2_DeQuant8 [r0+0x10], xmm0, xmm1
    SSE2_DeQuant8 [r0+0x20], xmm0, xmm1
    SSE2_DeQuant8 [r0+0x30], xmm0, xmm1
    SSE2_DeQuant8 [r0+0x40], xmm0, xmm1
    SSE2_DeQuant8 [r0+0x50], xmm0, xmm1
    SSE2_DeQuant8 [r0+0x60], xmm0, xmm1
    SSE2_DeQuant8 [r0+0x70], xmm0, xmm1

    ret

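Dequantization here is a plain per-coefficient multiply: xmm1 is loaded once with eight mf multipliers and SSE2_DeQuant8 applies pmullw to each group of eight coefficients. The scalar model (a sketch; n is 32 for the Four4x4 variant, 16 for the single-block one):

#include <stdint.h>

/* Sketch: multiply each coefficient by the repeating 8-entry mf pattern;
 * pmullw keeps the low 16 bits of each product. */
static void Dequant_sketch(int16_t *pDct, const uint16_t *mf, int n) {
    for (int i = 0; i < n; ++i)
        pDct[i] = (int16_t)(pDct[i] * mf[i & 7]);
}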
@ -330,41 +330,41 @@ WELS_EXTERN WelsDequantFour4x4_sse2
;void WelsDequantIHadamard4x4_sse2(int16_t *rs, const uint16_t mf);
;***********************************************************************
WELS_EXTERN WelsDequantIHadamard4x4_sse2
    %assign push_num 0
    LOAD_2_PARA
%ifndef X86_32
    movzx r1, r1w
%endif

    ; WelsDequantLumaDc4x4
    SSE2_Copy8Times xmm1, r1d
    ;psrlw xmm1, 2 ; for the (>>2) in ihdm
    MOVDQ xmm0, [r0]
    MOVDQ xmm2, [r0+0x10]
    pmullw xmm0, xmm1
    pmullw xmm2, xmm1

    ; ihdm_4x4
    movdqa xmm1, xmm0
    psrldq xmm1, 8
    movdqa xmm3, xmm2
    psrldq xmm3, 8

    SSE2_SumSub xmm0, xmm3, xmm5 ; xmm0 = xmm0 - xmm3, xmm3 = xmm0 + xmm3
    SSE2_SumSub xmm1, xmm2, xmm5 ; xmm1 = xmm1 - xmm2, xmm2 = xmm1 + xmm2
    SSE2_SumSub xmm3, xmm2, xmm5 ; xmm3 = xmm3 - xmm2, xmm2 = xmm3 + xmm2
    SSE2_SumSub xmm0, xmm1, xmm5 ; xmm0 = xmm0 - xmm1, xmm1 = xmm0 + xmm1

    SSE2_TransTwo4x4W xmm2, xmm1, xmm3, xmm0, xmm4
    SSE2_SumSub xmm2, xmm4, xmm5
    SSE2_SumSub xmm1, xmm0, xmm5
    SSE2_SumSub xmm4, xmm0, xmm5
    SSE2_SumSub xmm2, xmm1, xmm5
    SSE2_TransTwo4x4W xmm0, xmm1, xmm4, xmm2, xmm3

    punpcklqdq xmm0, xmm1
    MOVDQ [r0], xmm0

    punpcklqdq xmm2, xmm3
    MOVDQ [r0+16], xmm2
    ret

@ -35,189 +35,189 @@ SECTION .text

;**********************************************************************************************************************************
;
; uint32_t SampleSad16x16Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16 base_cost[8], int32_t *index_min_cost )
;
; \note:
;   src must be 16-byte aligned; alignment of ref is optional
; \return value:
;   the minimal SAD cost; the corresponding offset index is written to index_min_cost
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 stores the sad costs
%macro SAD_16x16_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
    movdqa xmm0, [%1]
    movdqu xmm1, [%2]
    movdqu xmm2, [%2+8h]
    movdqa xmm3, xmm1
    movdqa xmm4, xmm2

    mpsadbw xmm1, xmm0, 0 ; 000 B
    paddw xmm7, xmm1 ; accumulate cost

    mpsadbw xmm3, xmm0, 5 ; 101 B
    paddw xmm7, xmm3 ; accumulate cost

    mpsadbw xmm2, xmm0, 2 ; 010 B
    paddw xmm7, xmm2 ; accumulate cost

    mpsadbw xmm4, xmm0, 7 ; 111 B
    paddw xmm7, xmm4 ; accumulate cost

    add %1, %3
    add %2, %4
%endmacro ; end of SAD_16x16_LINE_SSE41

%macro SAD_16x16_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
    movdqa xmm0, [%1]
    movdqu xmm1, [%2]
    movdqu xmm2, [%2+8h]
    movdqa xmm3, xmm1
    movdqa xmm4, xmm2

    mpsadbw xmm1, xmm0, 0 ; 000 B
    paddw xmm7, xmm1 ; accumulate cost

    mpsadbw xmm3, xmm0, 5 ; 101 B
    paddw xmm7, xmm3 ; accumulate cost

    mpsadbw xmm2, xmm0, 2 ; 010 B
    paddw xmm7, xmm2 ; accumulate cost

    mpsadbw xmm4, xmm0, 7 ; 111 B
    paddw xmm7, xmm4 ; accumulate cost
%endmacro ; end of SAD_16x16_LINE_SSE41E

WELS_EXTERN SampleSad16x16Hor8_sse41
    ;push ebx
    ;push esi
    ;mov eax, [esp+12] ; src
    ;mov ecx, [esp+16] ; stride_src
    ;mov ebx, [esp+20] ; ref
    ;mov edx, [esp+24] ; stride_ref
    ;mov esi, [esp+28] ; base_cost
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    pxor xmm7, xmm7

    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3

    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3

    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3

    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41 r0, r2, r1, r3
    SAD_16x16_LINE_SSE41E r0, r2, r1, r3

    pxor xmm0, xmm0
    movdqa xmm6, xmm7
    punpcklwd xmm6, xmm0
    punpckhwd xmm7, xmm0

    movdqa xmm5, [r4]
    movdqa xmm4, xmm5
    punpcklwd xmm4, xmm0
    punpckhwd xmm5, xmm0

    paddd xmm4, xmm6
    paddd xmm5, xmm7
    movdqa xmm3, xmm4
    pminud xmm3, xmm5
    pshufd xmm2, xmm3, 01001110B
    pminud xmm2, xmm3
    pshufd xmm3, xmm2, 10110001B
    pminud xmm2, xmm3
    movd retrd, xmm2
    pcmpeqd xmm4, xmm2
    movmskps r2d, xmm4
    bsf r1d, r2d
    jnz near WRITE_INDEX

    pcmpeqd xmm5, xmm2
    movmskps r2d, xmm5
    bsf r1d, r2d
    add r1d, 4

WRITE_INDEX:
    mov [r5], r1d
    POP_XMM
    LOAD_6_PARA_POP
    ret

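Each mpsadbw above produces eight 4-byte partial SADs at successive horizontal alignments; the four control values (0, 5, 2, 7) select different source-dword/destination-half combinations so that, summed into xmm7, they form full 16-byte-row SADs for ref offsets 0..7. Functionally the whole routine is (a scalar sketch, illustration only):

#include <stdint.h>

/* Sketch of SampleSad16x16Hor8: SAD of a 16x16 src block against ref
 * shifted right by 0..7 bytes, plus a per-offset base cost; returns the
 * minimal total and reports the winning offset index. */
static uint32_t SampleSad16x16Hor8_sketch(const uint8_t *src, int32_t ss,
                                          const uint8_t *ref, int32_t rs,
                                          const uint16_t base_cost[8],
                                          int32_t *index_min_cost) {
    uint32_t best = UINT32_MAX;
    for (int k = 0; k < 8; ++k) {
        uint32_t cost = base_cost[k];
        for (int y = 0; y < 16; ++y)
            for (int x = 0; x < 16; ++x) {
                int d = src[y * ss + x] - ref[y * rs + x + k];
                cost += (uint32_t)(d < 0 ? -d : d);
            }
        if (cost < best) { best = cost; *index_min_cost = k; }
    }
    return best;
}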
;**********************************************************************************************************************************
;
; uint32_t SampleSad8x8Hor8_sse41( uint8_t *src, int32_t stride_src, uint8_t *ref, int32_t stride_ref, uint16_t base_cost[8], int32_t *index_min_cost )
;
; \note:
;   src and ref need not be 16-byte aligned (inter 8x8 blocks)
; \return value:
;   the minimal SAD cost; the corresponding offset index is written to index_min_cost
;
;**********************************************************************************************************************************
; try 8 mv via offset
; xmm7 stores the sad costs
%macro SAD_8x8_LINE_SSE41 4 ; src, ref, stride_src, stride_ref
    movdqu xmm0, [%1]
    movdqu xmm1, [%2]
    movdqa xmm2, xmm1

    mpsadbw xmm1, xmm0, 0 ; 000 B
    paddw xmm7, xmm1 ; accumulate cost

    mpsadbw xmm2, xmm0, 5 ; 101 B
    paddw xmm7, xmm2 ; accumulate cost

    add %1, %3
    add %2, %4
%endmacro ; end of SAD_8x8_LINE_SSE41

%macro SAD_8x8_LINE_SSE41E 4 ; src, ref, stride_src, stride_ref
    movdqu xmm0, [%1]
    movdqu xmm1, [%2]
    movdqa xmm2, xmm1

    mpsadbw xmm1, xmm0, 0 ; 000 B
    paddw xmm7, xmm1 ; accumulate cost

    mpsadbw xmm2, xmm0, 5 ; 101 B
    paddw xmm7, xmm2 ; accumulate cost
%endmacro ; end of SAD_8x8_LINE_SSE41E

WELS_EXTERN SampleSad8x8Hor8_sse41
    %assign push_num 0
    LOAD_6_PARA
    PUSH_XMM 8
    SIGN_EXTENSION r1, r1d
    SIGN_EXTENSION r3, r3d
    movdqa xmm7, [r4] ; load base cost list

    SAD_8x8_LINE_SSE41 r0, r2, r1, r3
    SAD_8x8_LINE_SSE41 r0, r2, r1, r3
    SAD_8x8_LINE_SSE41 r0, r2, r1, r3
    SAD_8x8_LINE_SSE41 r0, r2, r1, r3

    SAD_8x8_LINE_SSE41 r0, r2, r1, r3
    SAD_8x8_LINE_SSE41 r0, r2, r1, r3
    SAD_8x8_LINE_SSE41 r0, r2, r1, r3
    SAD_8x8_LINE_SSE41E r0, r2, r1, r3

    phminposuw xmm0, xmm7 ; horizontal search for the minimal sad cost and its index
    movd retrd, xmm0 ; for return: DEST[15:0] <- MIN, DEST[31:16] <- INDEX
    mov r1d, retrd
    and retrd, 0xFFFF
    sar r1d, 16
    mov [r5], r1d

    POP_XMM
    LOAD_6_PARA_POP

@ -104,32 +104,32 @@ db 6,7,6,7,7,8

align 16
high_mask_table:
db 0, 0, 0, 3, 0, 2, 3, 6, 0, 2
db 2, 5, 3, 5, 6, 9, 0, 1, 2, 5
db 2, 4, 5, 8, 3, 5, 5, 8, 6, 8
db 9,12, 0, 1, 1, 4, 2, 4, 5, 8
db 2, 4, 4, 7, 5, 7, 8,11, 3, 4
db 5, 8, 5, 7, 8,11, 6, 8, 8,11
db 9,11,12,15, 0, 1, 1, 4, 1, 3
db 4, 7, 2, 4, 4, 7, 5, 7, 8,11
db 2, 3, 4, 7, 4, 6, 7,10, 5, 7
db 7,10, 8,10,11,14, 3, 4, 4, 7
db 5, 7, 8,11, 5, 7, 7,10, 8,10
db 11,14, 6, 7, 8,11, 8,10,11,14
db 9,11,11,14,12,14,15,18, 0, 0
db 1, 4, 1, 3, 4, 7, 1, 3, 3, 6
db 4, 6, 7,10, 2, 3, 4, 7, 4, 6
db 7,10, 5, 7, 7,10, 8,10,11,14
db 2, 3, 3, 6, 4, 6, 7,10, 4, 6
db 6, 9, 7, 9,10,13, 5, 6, 7,10
db 7, 9,10,13, 8,10,10,13,11,13
db 14,17, 3, 4, 4, 7, 4, 6, 7,10
db 5, 7, 7,10, 8,10,11,14, 5, 6
db 7,10, 7, 9,10,13, 8,10,10,13
db 11,13,14,17, 6, 7, 7,10, 8,10
db 11,14, 8,10,10,13,11,13,14,17
db 9,10,11,14,11,13,14,17,12,14
db 14,17,15,17,18,21

align 16
low_mask_table:
@ -167,173 +167,173 @@ SECTION .text
|
||||
;void WelsScan4x4DcAc_sse2( int16_t level[16], int16_t *pDct )
|
||||
;***********************************************************************
|
||||
WELS_EXTERN WelsScan4x4DcAc_sse2
|
||||
%ifdef X86_32
|
||||
push r3
|
||||
%assign push_num 1
|
||||
%else
|
||||
%assign push_num 0
|
||||
%endif
|
||||
LOAD_2_PARA
|
||||
movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
|
||||
movdqa xmm1, [r1+16] ; f e d c b a 9 8
|
||||
pextrw r2d, xmm0, 7 ; ecx = 7
|
||||
pextrw r3d, xmm1, 2 ; edx = a
|
||||
pextrw r1d, xmm0, 5 ; eax = 5
|
||||
pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
|
||||
pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
|
||||
pextrw r2d, xmm1, 0 ; ecx = 8
|
||||
pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
|
||||
pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
|
||||
pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
|
||||
pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
|
||||
pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
|
||||
pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
|
||||
movdqa [r0],xmm0
|
||||
movdqa [r0+16], xmm1
|
||||
%ifdef X86_32
|
||||
pop r3
|
||||
%endif
|
||||
ret
|
||||
%ifdef X86_32
|
||||
push r3
|
||||
%assign push_num 1
|
||||
%else
|
||||
%assign push_num 0
|
||||
%endif
|
||||
LOAD_2_PARA
|
||||
movdqa xmm0, [r1] ; 7 6 5 4 3 2 1 0
|
||||
movdqa xmm1, [r1+16] ; f e d c b a 9 8
|
||||
pextrw r2d, xmm0, 7 ; ecx = 7
|
||||
pextrw r3d, xmm1, 2 ; edx = a
|
||||
pextrw r1d, xmm0, 5 ; eax = 5
|
||||
pinsrw xmm1, r2d, 2 ; f e d c b 7 9 8
|
||||
pinsrw xmm0, r1d, 7 ; 5 6 5 4 3 2 1 0
|
||||
pextrw r2d, xmm1, 0 ; ecx = 8
|
||||
pinsrw xmm0, r2d, 5 ; 5 6 8 4 3 2 1 0
|
||||
pinsrw xmm1, r3d, 0 ; f e d c b 7 9 a
|
||||
pshufd xmm2, xmm0, 0xd8 ; 5 6 3 2 8 4 1 0
|
||||
pshufd xmm3, xmm1, 0xd8 ; f e b 7 d c 9 a
|
||||
pshufhw xmm0, xmm2, 0x93 ; 6 3 2 5 8 4 1 0
|
||||
pshuflw xmm1, xmm3, 0x39 ; f e b 7 a d c 9
|
||||
movdqa [r0],xmm0
|
||||
movdqa [r0+16], xmm1
|
||||
%ifdef X86_32
|
||||
pop r3
|
||||
%endif
|
||||
ret
|
||||
|
||||
;***********************************************************************
|
||||
;void WelsScan4x4DcAc_ssse3( int16_t level[16], int16_t *pDct )
|
||||
;***********************************************************************
|
||||
WELS_EXTERN WelsScan4x4DcAc_ssse3
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
movdqa xmm0, [r1]
|
||||
movdqa xmm1, [r1+16]
|
||||
pextrw r2d, xmm0, 7 ; ecx = [7]
|
||||
pextrw r1d, xmm1, 0 ; eax = [8]
|
||||
pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
|
||||
pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
|
||||
pshufb xmm1, [pb_scanacdc_maskb]
|
||||
pshufb xmm0, [pb_scanacdc_maska]
|
||||
%assign push_num 0
|
||||
LOAD_2_PARA
|
||||
movdqa xmm0, [r1]
|
||||
movdqa xmm1, [r1+16]
|
||||
pextrw r2d, xmm0, 7 ; ecx = [7]
|
||||
pextrw r1d, xmm1, 0 ; eax = [8]
|
||||
pinsrw xmm0, r1d, 7 ; xmm0[7] = [8]
|
||||
pinsrw xmm1, r2d, 0 ; xmm1[0] = [7]
|
||||
pshufb xmm1, [pb_scanacdc_maskb]
|
||||
pshufb xmm0, [pb_scanacdc_maska]
|
||||
|
||||
movdqa [r0],xmm0
|
||||
movdqa [r0+16], xmm1
|
||||
ret
|
||||
movdqa [r0],xmm0
|
||||
movdqa [r0+16], xmm1
|
||||
ret
|
||||
;***********************************************************************
;void WelsScan4x4Ac_sse2( int16_t* zig_value, int16_t* pDct )
;***********************************************************************
WELS_EXTERN WelsScan4x4Ac_sse2
    %assign push_num 0
    LOAD_2_PARA
    movdqa xmm0, [r1]
    movdqa xmm1, [r1+16]
    movdqa xmm2, xmm0
    punpcklqdq xmm0, xmm1
    punpckhqdq xmm2, xmm1

    movdqa xmm3, xmm0
    punpckldq xmm0, xmm2
    punpckhdq xmm3, xmm2
    pextrw r1d, xmm0, 3
    pextrw r2d, xmm0, 7
    pinsrw xmm0, r1d, 7
    pextrw r1d, xmm3, 4
    pinsrw xmm3, r2d, 4
    pextrw r2d, xmm3, 0
    pinsrw xmm3, r1d, 0
    pinsrw xmm0, r2d, 3

    pshufhw xmm1, xmm0, 0x93
    pshuflw xmm2, xmm3, 0x39

    movdqa xmm3, xmm2
    psrldq xmm1, 2
    pslldq xmm3, 14
    por xmm1, xmm3
    psrldq xmm2, 2
    movdqa [r0], xmm1
    movdqa [r0+16], xmm2
    ret

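A scalar sketch of the AC variant, reusing kZigzag4x4 from the sketch above. The final psrldq pair appears to drop the DC coefficient and append a zero; that reading is an assumption:

/* Scalar sketch of WelsScan4x4Ac: zig-zag scan with the DC coefficient
 * dropped and a trailing zero appended. */
static void Scan4x4Ac_c(int16_t zig_value[16], const int16_t *pDct) {
    for (int i = 1; i < 16; ++i)
        zig_value[i - 1] = pDct[kZigzag4x4[i]];
    zig_value[15] = 0;
}
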
;***********************************************************************
;int32_t WelsCalculateSingleCtr4x4_sse2( int16_t *pDct );
;***********************************************************************
WELS_EXTERN WelsCalculateSingleCtr4x4_sse2
%ifdef X86_32
    push r3
    %assign push_num 1
%else
    %assign push_num 0
%endif
    LOAD_1_PARA
    movdqa xmm0, [r0]
    movdqa xmm1, [r0+16]

    packsswb xmm0, xmm1
    ; below is the register map: r0 - eax, r1 - ebx, r2 - ecx, r3 - edx
    xor r3, r3
    pxor xmm3, xmm3
    pcmpeqb xmm0, xmm3
    pmovmskb r3d, xmm0

    xor r3, 0xffff

    xor r0, r0
    mov r2, 7
    mov r1, 8
.loop_low8_find1:
    bt r3, r2
    jc .loop_high8_find1
    dec r2
    jnz .loop_low8_find1
.loop_high8_find1:
    bt r3, r1
    jc .find1end
    inc r1
    cmp r1, 16
    jb .loop_high8_find1
.find1end:
    sub r1, r2
    sub r1, 1
    lea r2, [i_ds_table]
    add r0b, [r2+r1]
    mov r1, r3
    and r3, 0xff
    shr r1, 8
    and r1, 0xff
    lea r2, [low_mask_table]
    add r0b, [r2+r3]
    lea r2, [high_mask_table]
    add r0b, [r2+r1]
%ifdef X86_32
    pop r3
%else
    mov retrd, r0d
%endif
    ret

;***********************************************************************
; int32_t WelsGetNoneZeroCount_sse2(int16_t* level);
;***********************************************************************
WELS_EXTERN WelsGetNoneZeroCount_sse2
    %assign push_num 0
    LOAD_1_PARA
    movdqa xmm0, [r0]
    movdqa xmm1, [r0+16]
    pxor xmm2, xmm2
    pcmpeqw xmm0, xmm2
    pcmpeqw xmm1, xmm2
    packsswb xmm1, xmm0
    xor r1, r1
    pmovmskb r1d, xmm1
    xor r1d, 0xffff
    mov r2, r1
    and r1, 0xff
    shr r2, 8
    ; and ecx, 0xff ; not needed: the high 16 bits are already zero
    ; xor retr, retr
    ;add al, [nozero_count_table+r2]
    lea r0, [nozero_count_table]
    movzx r2, byte [r0+r2]
    movzx r1, byte [r0+r1]
    mov retrq, r2
    add retrq, r1
    ;add al, [nozero_count_table+r1]
    ret

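For reference, the routine above answers a simple question with a comparison mask and two byte-table lookups. A scalar C equivalent of the result it computes:

#include <stdint.h>

/* Scalar equivalent of WelsGetNoneZeroCount: how many of the 16
 * coefficients are nonzero. */
static int32_t GetNoneZeroCount_c(const int16_t *level) {
    int32_t n = 0;
    for (int i = 0; i < 16; ++i)
        n += (level[i] != 0);
    return n;
}
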
@ -36,17 +36,17 @@

#ifdef __APPLE__
.macro SQR_ADD_16BYTES
    vmull.u8 q3, $0, $0
    vmull.u8 q8, $1, $1
    vpadal.u16 $2, q3
    vpadal.u16 $2, q8
.endm
#else
.macro SQR_ADD_16BYTES arg0, arg1, arg2
    vmull.u8 q3, \arg0, \arg0
    vmull.u8 q8, \arg1, \arg1
    vpadal.u16 \arg2, q3
    vpadal.u16 \arg2, q8
.endm
#endif

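In scalar terms the macro accumulates a sum of squares over 16 bytes (the NEON form keeps the running total spread across four 32-bit lanes). A minimal sketch:

#include <stdint.h>

/* Scalar form of SQR_ADD_16BYTES: acc += sum of squares of 16 bytes. */
static void SqrAdd16Bytes_c(const uint8_t b[16], uint32_t *acc) {
    for (int i = 0; i < 16; ++i)
        *acc += (uint32_t)b[i] * b[i];
}
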
@ -54,66 +54,66 @@
WELS_ASM_FUNC_BEGIN SampleVariance16x16_neon
    stmdb sp!, {r4}

    vld1.8 {q15}, [r0], r1 //load the ref data (16 bytes)
    vld1.8 {q14}, [r2], r3 //load the src data (16 bytes)

    vabd.u8 q13, q14, q15
    vmull.u8 q12, d27, d27
    vmull.u8 q11, d26, d26
    vaddl.u16 q12, d24, d25
    vpadal.u16 q12, q11 //sqr

    vaddl.u8 q13, d26, d27 //sum

    vaddl.u8 q10, d28, d29 //sum_cur

    vmull.u8 q9, d29, d29
    vmull.u8 q8, d28, d28
    vaddl.u16 q9, d18, d19 //sqr_cur
    vpadal.u16 q9, q8

    mov r4, #15
pixel_var_16x16_loop0:

    vld1.8 {q0}, [r0], r1 //load the ref data (16 bytes)
    vld1.8 {q1}, [r2], r3 //load the src data (16 bytes)

    vabd.u8 q2, q0, q1

    //q10 accumulates sum_cur
    vpadal.u8 q10, q1

    //q12 accumulates sqr
    SQR_ADD_16BYTES d4, d5, q12

    //q13 accumulates sum
    vpadal.u8 q13, q2

    subs r4, #1

    //q9 accumulates sqr_cur
    SQR_ADD_16BYTES d2, d3, q9

    bne pixel_var_16x16_loop0

    vadd.u16 d0, d26, d27 //sum
    vadd.u16 d1, d20, d21 //sum_cur
    vpaddl.u16 q0, q0
    vadd.u32 d2, d24, d25 //sqr
    vadd.u32 d3, d18, d19 //sqr_cur
    vpadd.u32 d0, d0, d1
    vpadd.u32 d1, d2, d3

    ldr r4, [sp, #4]

    vshr.u32 q0, q0, #8
    vmul.u32 d0, d0
    vsub.u32 d0, d1, d0
    vmovl.u32 q0, d0
    vst2.16 {d0[0], d1[0]}, [r4]

    ldmia sp!, {r4}

WELS_ASM_FUNC_END

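A scalar sketch of what the routine above computes: sums and sums of squares over the 16x16 block, then var = E[x^2] - E[x]^2 with truncating >>8 divisions, matching the vshr/vmul/vsub tail. The output struct and its field names are assumptions; the asm merely stores two uint16 values through the fifth argument via vst2.16:

#include <stdint.h>

typedef struct {
    uint16_t uiMotionIndex;  /* variance of |ref - src|, assumed name */
    uint16_t uiTextureIndex; /* variance of src itself, assumed name  */
} SMotionTextureUnit;

static void SampleVariance16x16_c(const uint8_t *ref, int32_t ref_stride,
                                  const uint8_t *src, int32_t src_stride,
                                  SMotionTextureUnit *out) {
    uint32_t sum_d = 0, sqr_d = 0, sum_c = 0, sqr_c = 0;
    for (int y = 0; y < 16; ++y) {
        for (int x = 0; x < 16; ++x) {
            uint32_t d = ref[x] > src[x] ? (uint32_t)(ref[x] - src[x])
                                         : (uint32_t)(src[x] - ref[x]);
            sum_d += d;
            sqr_d += d * d;
            sum_c += src[x];
            sqr_c += (uint32_t)src[x] * src[x];
        }
        ref += ref_stride;
        src += src_stride;
    }
    out->uiMotionIndex  = (uint16_t)((sqr_d >> 8) - (sum_d >> 8) * (sum_d >> 8));
    out->uiTextureIndex = (uint16_t)((sqr_c >> 8) - (sum_c >> 8) * (sum_c >> 8));
}
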
@ -30,313 +30,313 @@
 *
 */

#ifdef HAVE_NEON
.text
#include "arm_arch_common_macro.S"


WELS_ASM_FUNC_BEGIN DyadicBilinearDownsampler_neon
    stmdb sp!, {r4-r8, lr}

    //Get the width and height
    ldr r4, [sp, #24] //src_width
    ldr r5, [sp, #28] //src_height

    //Initialize the registers
    mov r6, r2
    mov r8, r0
    mov lr, #0
    lsr r5, #1

    //Save the tail for the unaligned size
    mla r7, r1, r5, r0
    vld1.32 {q15}, [r7]

    add r7, r2, r3
    //process a column of data
comp_ds_bilinear_loop0:

    vld1.8 {q0,q1}, [r2]!
    vld1.8 {q2,q3}, [r7]!
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vpaddl.u8 q2, q2
    vpaddl.u8 q3, q3
    vrshr.u16 q0, #1
    vrshr.u16 q1, #1
    vrshr.u16 q2, #1
    vrshr.u16 q3, #1
    vrhadd.u16 q0, q2
    vrhadd.u16 q1, q3
    vmovn.u16 d0, q0
    vmovn.u16 d1, q1
    vst1.32 {q0}, [r0]!
    add lr, #32

    cmp lr, r4
    movcs lr, #0
    addcs r6, r6, r3, lsl #1
    movcs r2, r6
    addcs r7, r2, r3
    addcs r8, r1
    movcs r0, r8
    subscs r5, #1
    bne comp_ds_bilinear_loop0

    //Restore the tail for the unaligned size
    vst1.32 {q15}, [r0]

    ldmia sp!, {r4-r8,lr}
WELS_ASM_FUNC_END

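A scalar sketch of the dyadic downsampling done here and in the w_x8/x16/x32 variants that follow: each output pixel averages a 2x2 source block. Note the NEON chain rounds in two stages (vrshr, then vrhadd), so its result can differ from this single-step rounding by one:

#include <stdint.h>

static void DyadicBilinearDownsample_c(uint8_t *dst, int32_t dst_stride,
                                       const uint8_t *src, int32_t src_stride,
                                       int32_t src_width, int32_t src_height) {
    for (int y = 0; y < src_height / 2; ++y) {
        const uint8_t *s0 = src + 2 * y * src_stride;  /* even source row */
        const uint8_t *s1 = s0 + src_stride;           /* odd source row  */
        uint8_t *d = dst + y * dst_stride;
        for (int x = 0; x < src_width / 2; ++x)
            d[x] = (uint8_t)((s0[2 * x] + s0[2 * x + 1] +
                              s1[2 * x] + s1[2 * x + 1] + 2) >> 2);
    }
}
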
WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x8_neon
    stmdb sp!, {r4-r7, lr}

    //Get the width and height
    ldr r4, [sp, #20] //src_width
    ldr r5, [sp, #24] //src_height

    //Get the difference
    sub lr, r3, r4
    sub r1, r1, r4, lsr #1

    lsr r5, #1

    //process a column of data
comp_ds_bilinear_w_x8_loop0:

    lsr r6, r4, #3
    add r7, r2, r3
    //process a line of data
comp_ds_bilinear_w_x8_loop1:

    vld1.8 {d0}, [r2]!
    vld1.8 {d1}, [r7]!
    vpaddl.u8 q0, q0
    vrshr.u16 q0, #1
    vrhadd.u16 d0, d1

    vmovn.u16 d0, q0
    vst1.32 {d0[0]}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x8_loop1

    add r2, r7, lr
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x8_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN comp_ds_bilinear_w_x16_neon
    stmdb sp!, {r4-r7, lr}

    //Get the width and height
    ldr r4, [sp, #20] //src_width
    ldr r5, [sp, #24] //src_height

    //Get the difference
    sub lr, r3, r4
    sub r1, r1, r4, lsr #1

    lsr r5, #1

    //process a column of data
comp_ds_bilinear_w_x16_loop0:

    lsr r6, r4, #4
    add r7, r2, r3
    //process a line of data
comp_ds_bilinear_w_x16_loop1:

    vld1.8 {q0}, [r2]!
    vld1.8 {q1}, [r7]!
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vrshr.u16 q0, #1
    vrshr.u16 q1, #1
    vrhadd.u16 q0, q1

    vmovn.u16 d0, q0
    vst1.32 {d0}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x16_loop1

    add r2, r7, lr
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x16_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END


WELS_ASM_FUNC_BEGIN DyadicBilinearDownsamplerWidthx32_neon
    stmdb sp!, {r4-r7, lr}

    //Get the width and height
    ldr r4, [sp, #20] //src_width
    ldr r5, [sp, #24] //src_height

    //Get the difference
    sub lr, r3, r4
    sub r1, r1, r4, lsr #1

    lsr r5, #1

    //process a column of data
comp_ds_bilinear_w_x32_loop0:

    lsr r6, r4, #5
    add r7, r2, r3
    //process a line of data
comp_ds_bilinear_w_x32_loop1:

    vld1.8 {q0,q1}, [r2]!
    vld1.8 {q2,q3}, [r7]!
    vpaddl.u8 q0, q0
    vpaddl.u8 q1, q1
    vpaddl.u8 q2, q2
    vpaddl.u8 q3, q3
    vrshr.u16 q0, #1
    vrshr.u16 q1, #1
    vrshr.u16 q2, #1
    vrshr.u16 q3, #1
    vrhadd.u16 q0, q2
    vrhadd.u16 q1, q3

    vmovn.u16 d0, q0
    vmovn.u16 d1, q1
    vst1.32 {q0}, [r0]!
    subs r6, #1
    bne comp_ds_bilinear_w_x32_loop1

    add r2, r7, lr
    add r0, r1
    subs r5, #1
    bne comp_ds_bilinear_w_x32_loop0

    ldmia sp!, {r4-r7,lr}
WELS_ASM_FUNC_END

WELS_ASM_FUNC_BEGIN GeneralBilinearAccurateDownsampler_neon
    stmdb sp!, {r4-r12, lr}

    //Get the data from the stack
    ldr r4, [sp, #40] //the addr of src
    ldr r5, [sp, #44] //the value of src_stride
    ldr r6, [sp, #48] //the value of scaleX
    ldr r7, [sp, #52] //the value of scaleY

    mov r10, #32768
    sub r10, #1
    and r8, r6, r10 // r8 uinc (scaleX mod 32767)
    mov r11, #-1
    mul r11, r8 // r11 -uinc

    vdup.s16 d2, r8
    vdup.s16 d0, r11
    vzip.s16 d0, d2 // uinc -uinc uinc -uinc

    and r9, r7, r10 // r9 vinc (scaleY mod 32767)
    mov r11, #-1
    mul r11, r9 // r11 -vinc

    vdup.s16 d2, r9
    vdup.s16 d3, r11
    vext.8 d5, d3, d2, #4 // vinc vinc -vinc -vinc

    mov r11, #0x40000000
    mov r12, #0x4000
    sub r12, #1
    add r11, r12
    vdup.s32 d1, r11 //init u 16384 16383 16384 16383

    mov r11, #16384
    vdup.s16 d16, r11
    sub r11, #1
    vdup.s16 d17, r11
    vext.8 d7, d17, d16, #4 //init v 16384 16384 16383 16383

    veor q14, q14
    sub r1, r2 // stride - width
    mov r8, #16384 // yInverse
    sub r3, #1

_HEIGHT:
    ldr r4, [sp, #40] //the addr of src
    mov r11, r8
    lsr r11, #15
    mul r11, r5
    add r11, r4 // get current row address
    mov r12, r11
    add r12, r5

    mov r9, #16384 // xInverse
    sub r10, r2, #1
    vmov.s16 d6, d1

_WIDTH:
    mov lr, r9
    lsr lr, #15
    add r4, r11, lr
    vld2.8 {d28[0],d29[0]}, [r4] //q14: 0000000b0000000a;
    add r4, r12, lr
    vld2.8 {d28[4],d29[4]}, [r4] //q14: 000d000b000c000a;
    vzip.32 d28, d29 //q14: 000d000c000b000a;

    vmull.u16 q13, d6, d7 //q13: init u * init v
    vmull.u32 q12, d26, d28
    vmlal.u32 q12, d27, d29
    vqadd.u64 d24, d24, d25
    vrshr.u64 d24, #30

    vst1.8 {d24[0]}, [r0]!
    add r9, r6
    vadd.u16 d6, d0 // inc u
    vshl.u16 d6, #1
    vshr.u16 d6, #1
    subs r10, #1
    bne _WIDTH

WIDTH_END:
    lsr r9, #15
    add r4, r11, r9
    vld1.8 {d24[0]}, [r4]
    vst1.8 {d24[0]}, [r0]
    add r0, #1
    add r8, r7
    add r0, r1
    vadd.s16 d7, d5 // inc v
    vshl.u16 d7, #1
    vshr.u16 d7, #1
    subs r3, #1
    bne _HEIGHT

LAST_ROW:
    ldr r4, [sp, #40] //the addr of src
    lsr r8, #15
    mul r8, r5
    add r4, r8 // get current row address
    mov r9, #16384

_LAST_ROW_WIDTH:
    mov r11, r9
    lsr r11, #15

    add r3, r4, r11
    vld1.8 {d0[0]}, [r3]
    vst1.8 {d0[0]}, [r0]
    add r0, #1
    add r9, r6
    subs r2, #1
    bne _LAST_ROW_WIDTH

    ldmia sp!, {r4-r12, lr}
WELS_ASM_FUNC_END

#endif

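A scalar sketch of the scaler above: the x and y positions walk the source in 1.15 fixed point, and each output blends a 2x2 neighbourhood with ~15-bit weights and a rounded >>30, mirroring the vmull/vmlal/vrshr chain. The asm's exact 16384/16383 weight split and its separate last-column/last-row paths (WIDTH_END, LAST_ROW) are simplified away here, so edge behaviour differs; this is illustrative only:

#include <stdint.h>

static void GeneralBilinearDownsample_c(uint8_t *dst, int32_t dst_stride,
                                        int32_t dst_width, int32_t dst_height,
                                        const uint8_t *src, int32_t src_stride,
                                        uint32_t scale_x, uint32_t scale_y) {
    uint32_t y_pos = 1u << 14;                  /* 1.15 fixed-point y */
    for (int j = 0; j < dst_height; ++j, y_pos += scale_y) {
        const uint8_t *s0 = src + (y_pos >> 15) * src_stride;
        const uint8_t *s1 = s0 + src_stride;
        uint32_t v = y_pos & 0x7fff;            /* vertical fraction  */
        uint32_t x_pos = 1u << 14;              /* 1.15 fixed-point x */
        for (int i = 0; i < dst_width; ++i, x_pos += scale_x) {
            uint32_t o = x_pos >> 15, u = x_pos & 0x7fff;
            uint64_t acc = (uint64_t)(32768 - u) * (32768 - v) * s0[o]
                         + (uint64_t)u * (32768 - v) * s0[o + 1]
                         + (uint64_t)(32768 - u) * v * s1[o]
                         + (uint64_t)u * v * s1[o + 1];
            dst[i] = (uint8_t)((acc + (1u << 29)) >> 30);  /* rounded */
        }
        dst += dst_stride;
    }
}
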
@ -37,32 +37,32 @@

WELS_ASM_FUNC_BEGIN WelsProcessingSampleSad8x8_neon
    stmdb sp!, {lr}
    //Load one horizontal line (8 bytes) from each block
    vld1.8 {d0}, [r0], r1
    vld1.8 {d1}, [r2], r3

    //SAD of the 8 bytes
    vabdl.u8 q1, d0, d1

    mov lr, #7
pixel_sad_8x8_loop0:

    //Load the next horizontal line (8 bytes) from each block
    vld1.8 {d0}, [r0], r1
    vld1.8 {d1}, [r2], r3

    subs lr, #1

    //Accumulate the SAD of the 8 bytes
    vabal.u8 q1, d0, d1
    bne pixel_sad_8x8_loop0

    vadd.u16 d2, d3
    vpaddl.u16 d2, d2
    vpaddl.u32 d2, d2
    vmov.u32 r0, d2[0] //TBO...

    ldmia sp!, {lr}
WELS_ASM_FUNC_END

#endif

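In scalar terms this is the classic 8x8 sum of absolute differences; a minimal C reference:

#include <stdint.h>
#include <stdlib.h>

/* Scalar equivalent of WelsProcessingSampleSad8x8. */
static int32_t SampleSad8x8_c(const uint8_t *a, int32_t a_stride,
                              const uint8_t *b, int32_t b_stride) {
    int32_t sad = 0;
    for (int y = 0; y < 8; ++y) {
        for (int x = 0; x < 8; ++x)
            sad += abs(a[x] - b[x]);
        a += a_stride;
        b += b_stride;
    }
    return sad;
}
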
@ -56,217 +56,217 @@ sse2_20 times 8 dw 20
;***********************************************************************
SECTION .text

%macro WEIGHT_LINE 9
    movq %2, %9
    punpcklbw %2, %7
    movdqa %8, %2

    movdqa %1, %6
    psubusb %1, %8
    psubusb %8, %6
    por %8, %1 ; ABS(curPixel - centerPixel);

    movdqa %1, %3
    psubusb %1, %8

    pmullw %1, %1
    psrlw %1, 5
    pmullw %2, %1
    paddusw %4, %1
    paddusw %5, %2
%endmacro

%macro WEIGHT_LINE1_UV 4
    movdqa %2, %1
    punpcklbw %2, %4
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 1
    punpcklbw %2, %4
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 2
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 3
    punpcklbw %2, %4
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 4
    punpcklbw %2, %4
    paddw %3, %2
%endmacro

%macro WEIGHT_LINE2_UV 4
    movdqa %2, %1
    punpcklbw %2, %4
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 1
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 2
    punpcklbw %2, %4
    psllw %2, 2
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 3
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 4
    punpcklbw %2, %4
    paddw %3, %2
%endmacro

%macro WEIGHT_LINE3_UV 4
    movdqa %2, %1
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 1
    punpcklbw %2, %4
    psllw %2, 2
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 2
    punpcklbw %2, %4
    pmullw %2, [sse2_20]
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 3
    punpcklbw %2, %4
    psllw %2, 2
    paddw %3, %2

    movdqa %2, %1
    psrldq %2, 4
    punpcklbw %2, %4
    psllw %2, 1
    paddw %3, %2
%endmacro

;***********************************************************************
; BilateralLumaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
; 1 2 3
; 4 0 5
; 6 7 8
; 0: the center point

WELS_EXTERN BilateralLumaFilter8_sse2
    push r3
    %assign push_num 1
    LOAD_2_PARA
    PUSH_XMM 8

    pxor xmm7, xmm7

    mov r3, r0

    movq xmm6, [r0]
    punpcklbw xmm6, xmm7
    movdqa xmm3, [sse2_32]
    pxor xmm4, xmm4 ; nTotWeight
    pxor xmm5, xmm5 ; nSum

    dec r0
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0]     ; pixel 4
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 5

    sub r0, r1
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0]     ; pixel 1
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 2
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 3

    lea r0, [r0 + r1 * 2]
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0]     ; pixel 6
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 1] ; pixel 7
    WEIGHT_LINE xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm0, [r0 + 2] ; pixel 8

    pcmpeqw xmm0, xmm0
    psrlw xmm0, 15
    psllw xmm0, 8
    psubusw xmm0, xmm4
    pmullw xmm0, xmm6
    paddusw xmm5, xmm0
    psrlw xmm5, 8
    packuswb xmm5, xmm5
    movq [r3], xmm5

    POP_XMM
    pop r3
    %assign push_num 0

    ret

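A scalar sketch of the filter above for a single pixel p (the SSE2 version runs 8 pixels at once). Each neighbour's weight falls off quadratically with its distance in value from the centre, as WEIGHT_LINE computes it, and the centre keeps whatever is left of 256:

#include <stdint.h>
#include <stdlib.h>

static uint8_t BilateralLuma_c(const uint8_t *p, int32_t stride) {
    static const int off[8][2] = { /* pixels 1..8 in the diagram above */
        {-1, -1}, {0, -1}, {1, -1},
        {-1,  0},          {1,  0},
        {-1,  1}, {0,  1}, {1,  1},
    };
    int c = p[0], wsum = 0, acc = 0;
    for (int k = 0; k < 8; ++k) {
        int q = p[off[k][1] * stride + off[k][0]];
        int d = abs(q - c);
        int w = d >= 32 ? 0 : ((32 - d) * (32 - d)) >> 5; /* pmullw + psrlw 5 */
        wsum += w;
        acc += w * q;
    }
    acc += (256 - wsum) * c; /* pcmpeqw/psrlw/psllw builds the 256 */
    return (uint8_t)(acc >> 8);
}
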
;***********************************************************************
; void WaverageChromaFilter8_sse2(uint8_t *pixels, int stride);
;***********************************************************************
;5x5 filter:
;1 1  2 1 1
;1 2  4 2 1
;2 4 20 4 2
;1 2  4 2 1
;1 1  2 1 1

WELS_EXTERN WaverageChromaFilter8_sse2
    push r3

    %assign push_num 1

    LOAD_2_PARA

    mov r3, r1
    add r3, r3
    sub r0, r3 ; pixels - 2 * stride
    sub r0, 2

    pxor xmm0, xmm0
    pxor xmm3, xmm3

    movdqu xmm1, [r0]
    WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0

    movdqu xmm1, [r0 + r1]
    WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0

    add r0, r3
    movdqu xmm1, [r0]
    WEIGHT_LINE3_UV xmm1, xmm2, xmm3, xmm0

    movdqu xmm1, [r0 + r1]
    WEIGHT_LINE2_UV xmm1, xmm2, xmm3, xmm0

    movdqu xmm1, [r0 + r1 * 2]
    WEIGHT_LINE1_UV xmm1, xmm2, xmm3, xmm0

    psrlw xmm3, 6
    packuswb xmm3, xmm3
    movq [r0 + 2], xmm3

    pop r3

    %assign push_num 0
    ret

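The 5x5 kernel in the comment block above sums to 64, which is why the code ends with psrlw xmm3, 6. A scalar sketch for one pixel (the SSE2 version filters 8 chroma samples per call):

#include <stdint.h>

static const int kWav5x5[5][5] = {
    {1, 1,  2, 1, 1},
    {1, 2,  4, 2, 1},
    {2, 4, 20, 4, 2},
    {1, 2,  4, 2, 1},
    {1, 1,  2, 1, 1},
};

static uint8_t WaverageChroma_c(const uint8_t *p, int32_t stride) {
    int acc = 0;
    for (int dy = -2; dy <= 2; ++dy)
        for (int dx = -2; dx <= 2; ++dx)
            acc += kWav5x5[dy + 2][dx + 2] * p[dy * stride + dx];
    return (uint8_t)(acc >> 6); /* divide by the kernel sum, 64 */
}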